From 65ec2f1ca40aaeeb4b472efd877dd92dcb73f31f Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 13:52:37 +0200
Subject: [PATCH 01/89] Move headers to -inl path

---
 cpp/include/raft/core/{logger.hpp => logger-inl.hpp}            | 2 +-
 .../detail/pairwise_matrix/{dispatch.cuh => dispatch-inl.cuh}   | 0
 cpp/include/raft/distance/{distance.cuh => distance-inl.cuh}    | 0
 .../raft/distance/{fused_l2_nn.cuh => fused_l2_nn-inl.cuh}      | 0
 .../{coalesced_reduction.cuh => coalesced_reduction-inl.cuh}    | 2 +-
 .../raft/matrix/detail/{select_k.cuh => select_k-inl.cuh}       | 0
 .../raft/neighbors/{ball_cover.cuh => ball_cover-inl.cuh}       | 0
 .../raft/neighbors/{brute_force.cuh => brute_force-inl.cuh}     | 0
 .../detail/{ivf_flat_search.cuh => ivf_flat_search-inl.cuh}     | 0
 .../detail/{selection_faiss.cuh => selection_faiss-inl.cuh}     | 0
 cpp/include/raft/neighbors/{ivf_flat.cuh => ivf_flat-inl.cuh}   | 0
 cpp/include/raft/neighbors/{ivf_pq.cuh => ivf_pq-inl.cuh}       | 0
 cpp/include/raft/neighbors/{refine.cuh => refine-inl.cuh}       | 0
 .../knn/detail/ball_cover/{registers.cuh => registers-inl.cuh}  | 0
 .../knn/detail/{fused_l2_knn.cuh => fused_l2_knn-inl.cuh}       | 0
 15 files changed, 2 insertions(+), 2 deletions(-)
 rename cpp/include/raft/core/{logger.hpp => logger-inl.hpp} (99%)
 rename cpp/include/raft/distance/detail/pairwise_matrix/{dispatch.cuh => dispatch-inl.cuh} (100%)
 rename cpp/include/raft/distance/{distance.cuh => distance-inl.cuh} (100%)
 rename cpp/include/raft/distance/{fused_l2_nn.cuh => fused_l2_nn-inl.cuh} (100%)
 rename cpp/include/raft/linalg/detail/{coalesced_reduction.cuh => coalesced_reduction-inl.cuh} (99%)
 rename cpp/include/raft/matrix/detail/{select_k.cuh => select_k-inl.cuh} (100%)
 rename cpp/include/raft/neighbors/{ball_cover.cuh => ball_cover-inl.cuh} (100%)
 rename cpp/include/raft/neighbors/{brute_force.cuh => brute_force-inl.cuh} (100%)
 rename cpp/include/raft/neighbors/detail/{ivf_flat_search.cuh => ivf_flat_search-inl.cuh} (100%)
 rename cpp/include/raft/neighbors/detail/{selection_faiss.cuh => selection_faiss-inl.cuh} (100%)
 rename cpp/include/raft/neighbors/{ivf_flat.cuh => ivf_flat-inl.cuh} (100%)
 rename cpp/include/raft/neighbors/{ivf_pq.cuh => ivf_pq-inl.cuh} (100%)
 rename cpp/include/raft/neighbors/{refine.cuh => refine-inl.cuh} (100%)
 rename cpp/include/raft/spatial/knn/detail/ball_cover/{registers.cuh => registers-inl.cuh} (100%)
 rename cpp/include/raft/spatial/knn/detail/{fused_l2_knn.cuh => fused_l2_knn-inl.cuh} (100%)

diff --git a/cpp/include/raft/core/logger.hpp b/cpp/include/raft/core/logger-inl.hpp
similarity index 99%
rename from cpp/include/raft/core/logger.hpp
rename to cpp/include/raft/core/logger-inl.hpp
index 3984ec042a..d8f25f6d03 100644
--- a/cpp/include/raft/core/logger.hpp
+++ b/cpp/include/raft/core/logger-inl.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh
similarity index 100%
rename from cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
rename to cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh
diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance-inl.cuh
similarity index 100%
rename from cpp/include/raft/distance/distance.cuh
rename to cpp/include/raft/distance/distance-inl.cuh
diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn-inl.cuh
similarity index 100%
rename from cpp/include/raft/distance/fused_l2_nn.cuh
rename to cpp/include/raft/distance/fused_l2_nn-inl.cuh
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
similarity index 99%
rename from cpp/include/raft/linalg/detail/coalesced_reduction.cuh
rename to cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
index 238e17fa56..4cc19d4a17 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/include/raft/matrix/detail/select_k.cuh b/cpp/include/raft/matrix/detail/select_k-inl.cuh
similarity index 100%
rename from cpp/include/raft/matrix/detail/select_k.cuh
rename to cpp/include/raft/matrix/detail/select_k-inl.cuh
diff --git a/cpp/include/raft/neighbors/ball_cover.cuh b/cpp/include/raft/neighbors/ball_cover-inl.cuh
similarity index 100%
rename from cpp/include/raft/neighbors/ball_cover.cuh
rename to cpp/include/raft/neighbors/ball_cover-inl.cuh
diff --git a/cpp/include/raft/neighbors/brute_force.cuh b/cpp/include/raft/neighbors/brute_force-inl.cuh
similarity index 100%
rename from cpp/include/raft/neighbors/brute_force.cuh
rename to cpp/include/raft/neighbors/brute_force-inl.cuh
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
similarity index 100%
rename from cpp/include/raft/neighbors/detail/ivf_flat_search.cuh
rename to cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
diff --git a/cpp/include/raft/neighbors/detail/selection_faiss.cuh b/cpp/include/raft/neighbors/detail/selection_faiss-inl.cuh
similarity index 100%
rename from cpp/include/raft/neighbors/detail/selection_faiss.cuh
rename to cpp/include/raft/neighbors/detail/selection_faiss-inl.cuh
diff --git a/cpp/include/raft/neighbors/ivf_flat.cuh b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
similarity index 100%
rename from cpp/include/raft/neighbors/ivf_flat.cuh
rename to cpp/include/raft/neighbors/ivf_flat-inl.cuh
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
similarity index 100%
rename from cpp/include/raft/neighbors/ivf_pq.cuh
rename to cpp/include/raft/neighbors/ivf_pq-inl.cuh
diff --git a/cpp/include/raft/neighbors/refine.cuh b/cpp/include/raft/neighbors/refine-inl.cuh
similarity index 100%
rename from cpp/include/raft/neighbors/refine.cuh
rename to cpp/include/raft/neighbors/refine-inl.cuh
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-inl.cuh
similarity index 100%
rename from cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
rename to cpp/include/raft/spatial/knn/detail/ball_cover/registers-inl.cuh
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-inl.cuh
similarity index 100%
rename from cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
rename to cpp/include/raft/spatial/knn/detail/fused_l2_knn-inl.cuh

From 51e8673d5a3a7cbad98a153d8676e55c111d3ebb Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 13:55:16 +0200
Subject: [PATCH 02/89] Add back empty headers

---
 cpp/include/raft/core/logger.hpp                                | 0
 cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh   | 0
 cpp/include/raft/distance/distance.cuh                          | 0
 cpp/include/raft/distance/fused_l2_nn.cuh                       | 0
 cpp/include/raft/linalg/detail/coalesced_reduction.cuh          | 0
 cpp/include/raft/matrix/detail/select_k.cuh                     | 0
 cpp/include/raft/neighbors/ball_cover.cuh                       | 0
 cpp/include/raft/neighbors/brute_force.cuh                      | 0
 cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh | 0
 cpp/include/raft/neighbors/detail/ivf_flat_search.cuh           | 0
 cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity.cuh | 0
 cpp/include/raft/neighbors/detail/selection_faiss.cuh           | 0
 cpp/include/raft/neighbors/ivf_flat.cuh                         | 0
 cpp/include/raft/neighbors/ivf_pq.cuh                           | 0
 cpp/include/raft/neighbors/refine.cuh                           | 0
 cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh    | 0
 cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh            | 0
 cpp/include/raft/util/memory_pool.hpp                           | 0
 18 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 cpp/include/raft/core/logger.hpp
 create mode 100644 cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
 create mode 100644 cpp/include/raft/distance/distance.cuh
 create mode 100644 cpp/include/raft/distance/fused_l2_nn.cuh
 create mode 100644 cpp/include/raft/linalg/detail/coalesced_reduction.cuh
 create mode 100644 cpp/include/raft/matrix/detail/select_k.cuh
 create mode 100644 cpp/include/raft/neighbors/ball_cover.cuh
 create mode 100644 cpp/include/raft/neighbors/brute_force.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/ivf_flat_search.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/selection_faiss.cuh
 create mode 100644 cpp/include/raft/neighbors/ivf_flat.cuh
 create mode 100644 cpp/include/raft/neighbors/ivf_pq.cuh
 create mode 100644 cpp/include/raft/neighbors/refine.cuh
 create mode 100644 cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
 create mode 100644 cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
 create mode 100644 cpp/include/raft/util/memory_pool.hpp

diff --git a/cpp/include/raft/core/logger.hpp b/cpp/include/raft/core/logger.hpp
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/matrix/detail/select_k.cuh b/cpp/include/raft/matrix/detail/select_k.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/neighbors/ball_cover.cuh b/cpp/include/raft/neighbors/ball_cover.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/neighbors/brute_force.cuh b/cpp/include/raft/neighbors/brute_force.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/neighbors/detail/selection_faiss.cuh b/cpp/include/raft/neighbors/detail/selection_faiss.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/neighbors/ivf_flat.cuh b/cpp/include/raft/neighbors/ivf_flat.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/neighbors/refine.cuh b/cpp/include/raft/neighbors/refine.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cpp/include/raft/util/memory_pool.hpp b/cpp/include/raft/util/memory_pool.hpp
new file mode 100644
index 0000000000..e69de29bb2

From 9e03fca6ec00f6f9daf0fe2774071b146c556a31 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 3 Apr 2023 17:04:12 +0200
Subject: [PATCH 03/89] Disable warnings for clang compilation

---
 cpp/cmake/modules/ConfigureCUDA.cmake | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake
index c733d46985..d88d48a5cf 100644
--- a/cpp/cmake/modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/modules/ConfigureCUDA.cmake
@@ -19,6 +19,7 @@ endif()
 
 if(CMAKE_COMPILER_IS_GNUCXX)
   list(APPEND RAFT_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations)
+  list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations)
 endif()
 
 if(CUDA_LOG_COMPILE_TIME)
@@ -33,9 +34,9 @@ list(APPEND RAFT_CUDA_FLAGS -Xfatbin=-compress-all)
 
 # set warnings as errors
 if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0)
-  list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings)
+  # list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings)
 endif()
-list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations)
+
 
 # Option to enable line info in CUDA device compilation to allow introspection when profiling /
 # memchecking

From 3bace7b559d3156cdbb10cbeb0e9cfc186bb9703 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 3 Apr 2023 17:06:34 +0200
Subject: [PATCH 04/89] Comment out omp-based refine_host body to enable clang compilation

---
 cpp/include/raft/neighbors/detail/refine.cuh | 77 ++++++++++----------
 1 file changed, 38 insertions(+), 39 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/refine.cuh b/cpp/include/raft/neighbors/detail/refine.cuh
index aedfc42698..e3a2c7d109 100644
--- a/cpp/include/raft/neighbors/detail/refine.cuh
+++ b/cpp/include/raft/neighbors/detail/refine.cuh
@@ -25,7 +25,6 @@
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 
 #include <cstdlib>
-#include <omp.h>
 
 #include <thrust/sequence.h>
 
@@ -201,44 +200,44 @@ void refine_host(raft::host_matrix_view<const data_t, matrix_idx, row_major> dat
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "neighbors::refine_host(%zu, %u)", size_t(numQueries), uint32_t(topK));
 
-#pragma omp parallel
-  {
-    struct_for_refinement* sfr =
-      (struct_for_refinement*)malloc(sizeof(struct_for_refinement) * topK);
-    for (size_t i = omp_get_thread_num(); i < numQueries; i += omp_get_num_threads()) {
-      // compute distance with original dataset vectors
-      const data_t* cur_query = queries_ptr + ((uint64_t)dimDataset * i);
-      for (size_t j = 0; j < (size_t)topK; j++) {
-        idx_t id                  = neighbors[j + (topK * i)];
-        const data_t* cur_dataset = dataset_ptr + ((uint64_t)dimDataset * id);
-        float distance            = 0.0;
-        for (size_t k = 0; k < (size_t)dimDataset; k++) {
-          float val_q = (float)(cur_query[k]);
-          float val_d = (float)(cur_dataset[k]);
-          if (metric == raft::distance::DistanceType::InnerProduct) {
-            distance += -val_q * val_d;  // Negate because we sort in ascending order.
-          } else {
-            distance += (val_q - val_d) * (val_q - val_d);
-          }
-        }
-        sfr[j].id       = id;
-        sfr[j].distance = distance;
-      }
-
-      qsort(sfr, topK, sizeof(struct_for_refinement), _postprocessing_qsort_compare);
-
-      for (size_t j = 0; j < (size_t)refinedTopK; j++) {
-        refinedNeighbors[j + (refinedTopK * i)] = sfr[j].id;
-        if (refinedDistances == NULL) continue;
-        if (metric == raft::distance::DistanceType::InnerProduct) {
-          refinedDistances[j + (refinedTopK * i)] = -sfr[j].distance;
-        } else {
-          refinedDistances[j + (refinedTopK * i)] = sfr[j].distance;
-        }
-      }
-    }
-    free(sfr);
-  }
+  // #pragma omp parallel
+  //   {
+  //     struct_for_refinement* sfr =
+  //       (struct_for_refinement*)malloc(sizeof(struct_for_refinement) * topK);
+  //     for (size_t i = omp_get_thread_num(); i < numQueries; i += omp_get_num_threads()) {
+  //       // compute distance with original dataset vectors
+  //       const data_t* cur_query = queries_ptr + ((uint64_t)dimDataset * i);
+  //       for (size_t j = 0; j < (size_t)topK; j++) {
+  //         idx_t id                  = neighbors[j + (topK * i)];
+  //         const data_t* cur_dataset = dataset_ptr + ((uint64_t)dimDataset * id);
+  //         float distance            = 0.0;
+  //         for (size_t k = 0; k < (size_t)dimDataset; k++) {
+  //           float val_q = (float)(cur_query[k]);
+  //           float val_d = (float)(cur_dataset[k]);
+  //           if (metric == raft::distance::DistanceType::InnerProduct) {
+  //             distance += -val_q * val_d;  // Negate because we sort in ascending order.
+  //           } else {
+  //             distance += (val_q - val_d) * (val_q - val_d);
+  //           }
+  //         }
+  //         sfr[j].id       = id;
+  //         sfr[j].distance = distance;
+  //       }
+
+  //       qsort(sfr, topK, sizeof(struct_for_refinement), _postprocessing_qsort_compare);
+
+  //       for (size_t j = 0; j < (size_t)refinedTopK; j++) {
+  //         refinedNeighbors[j + (refinedTopK * i)] = sfr[j].id;
+  //         if (refinedDistances == NULL) continue;
+  //         if (metric == raft::distance::DistanceType::InnerProduct) {
+  //           refinedDistances[j + (refinedTopK * i)] = -sfr[j].distance;
+  //         } else {
+  //           refinedDistances[j + (refinedTopK * i)] = sfr[j].distance;
+  //         }
+  //       }
+  //     }
+  //     free(sfr);
+  //   }
 }
 
 }  // namespace raft::neighbors::detail

From f0e74a216dde5f570b3d24b3f1615c03a7ee4363 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 3 Apr 2023 17:09:16 +0200
Subject: [PATCH 05/89] cmake: Define RAFT_EXPLICIT_INSTANTIATE

We define this internally (PRIVATE). In addition, we define
RAFT_COMPILED both internally and externally (PUBLIC).
---
 cpp/CMakeLists.txt             | 4 +++-
 cpp/bench/prims/CMakeLists.txt | 1 +
 cpp/test/CMakeLists.txt        | 3 +++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 4753b534e4..e3cdcbf760 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -275,6 +275,7 @@ if(RAFT_COMPILE_LIBRARY)
     src/cluster/update_centroids_double.cu
     src/cluster/cluster_cost_float.cu
     src/cluster/cluster_cost_double.cu
+    src/core/logger.cpp
     src/neighbors/refine_d_int64_t_float.cu
     src/neighbors/refine_d_int64_t_int8_t.cu
     src/neighbors/refine_d_int64_t_uint8_t.cu
@@ -462,7 +463,8 @@ if(RAFT_COMPILE_LIBRARY)
     raft_lib PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAFT_CXX_FLAGS}>"
                      "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
   )
-  target_compile_definitions(raft_lib INTERFACE "RAFT_COMPILED")
+  target_compile_definitions(raft_lib PUBLIC "RAFT_COMPILED")
+  target_compile_definitions(raft_lib PRIVATE "RAFT_EXPLICIT_INSTANTIATE")
 
   # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
   target_link_options(raft_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index f6499623dd..c0c1706b2a 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -54,6 +54,7 @@ function(ConfigureBench)
     ${BENCH_NAME} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAFT_CXX_FLAGS}>"
                           "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
   )
+  target_compile_definitions(${BENCH_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE")
 
   target_include_directories(
     ${BENCH_NAME} PUBLIC "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/bench/prims>"
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index a778b0d195..91050461ae 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -58,6 +58,7 @@ function(ConfigureTest)
                          "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
   )
 
+  target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE")
   target_include_directories(${TEST_NAME} PUBLIC "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/test>")
 
   install(
@@ -111,6 +112,8 @@ if(BUILD_TESTS)
     test/core/span.cu
     test/core/temporary_device_buffer.cu
     test/test.cpp
+    OPTIONAL
+    LIB
   )
 
   ConfigureTest(

From a8f62fba1b2372ea19d39d834ebd3c7d56aa1432 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 3 Apr 2023 17:22:28 +0200
Subject: [PATCH 06/89] Split raft/core/logger

---
 cpp/include/raft/core/logger-ext.hpp    | 128 +++++++++
 cpp/include/raft/core/logger-inl.hpp    | 350 +++++++++---------------
 cpp/include/raft/core/logger-macros.hpp | 106 +++++++
 cpp/include/raft/core/logger.hpp        |  24 ++
 cpp/src/core/logger.cpp                 |  16 ++
 5 files changed, 404 insertions(+), 220 deletions(-)
 create mode 100644 cpp/include/raft/core/logger-ext.hpp
 create mode 100644 cpp/include/raft/core/logger-macros.hpp
 create mode 100644 cpp/src/core/logger.cpp

diff --git a/cpp/include/raft/core/logger-ext.hpp b/cpp/include/raft/core/logger-ext.hpp
new file mode 100644
index 0000000000..69688560c7
--- /dev/null
+++ b/cpp/include/raft/core/logger-ext.hpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <memory>         // std::unique_ptr
+#include <string>         // std::string
+#include <unordered_map>  // std::unordered_map
+
+namespace raft {
+
+static const std::string RAFT_NAME = "raft";
+static const std::string default_log_pattern("[%L] [%H:%M:%S.%f] %v");
+
+/**
+ * @brief The main Logging class for raft library.
+ *
+ * This class acts as a thin wrapper over the underlying `spdlog` interface. The
+ * design is done in this way in order to avoid us having to also ship `spdlog`
+ * header files in our installation.
+ *
+ * @todo This currently only supports logging to stdout. Need to add support in
+ *       future to add custom loggers as well [Issue #2046]
+ */
+class logger {
+ public:
+  // @todo setting the logger once per process with
+  logger(std::string const& name_ = "");
+  /**
+   * @brief Singleton method to get the underlying logger object
+   *
+   * @return the singleton logger object
+   */
+  static logger& get(std::string const& name = "");
+
+  /**
+   * @brief Set the logging level.
+   *
+   * Only messages with level equal or above this will be printed
+   *
+   * @param[in] level logging level
+   *
+   * @note The log level will actually be set only if the input is within the
+   *       range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll
+   *       be ignored. See documentation of decisiontree for how this gets used
+   */
+  void set_level(int level);
+
+  /**
+   * @brief Set the logging pattern
+   *
+   * @param[in] pattern the pattern to be set. Refer this link
+   *                    https://github.com/gabime/spdlog/wiki/3.-Custom-formatting
+   *                    to know the right syntax of this pattern
+   */
+  void set_pattern(const std::string& pattern);
+
+  /**
+   * @brief Register a callback function to be run in place of usual log call
+   *
+   * @param[in] callback the function to be run on all logged messages
+   */
+  void set_callback(void (*callback)(int lvl, const char* msg));
+
+  /**
+   * @brief Register a flush function compatible with the registered callback
+   *
+   * @param[in] flush the function to use when flushing logs
+   */
+  void set_flush(void (*flush)());
+
+  /**
+   * @brief Tells whether messages will be logged for the given log level
+   *
+   * @param[in] level log level to be checked for
+   * @return true if messages will be logged for this level, else false
+   */
+  bool should_log_for(int level) const;
+  /**
+   * @brief Query for the current log level
+   *
+   * @return the current log level
+   */
+  int get_level() const;
+
+  /**
+   * @brief Get the current logging pattern
+   * @return the pattern
+   */
+  std::string get_pattern() const;
+
+  /**
+   * @brief Main logging method
+   *
+   * @param[in] level logging level of this message
+   * @param[in] fmt   C-like format string, followed by respective params
+   */
+  void log(int level, const char* fmt, ...);
+
+  /**
+   * @brief Flush logs by calling flush on underlying logger
+   */
+  void flush();
+
+  ~logger();
+
+ private:
+  logger();
+  // pimpl pattern:
+  // https://learn.microsoft.com/en-us/cpp/cpp/pimpl-for-compile-time-encapsulation-modern-cpp?view=msvc-170
+  class impl;
+  std::unique_ptr<impl> pimpl;
+  static inline std::unordered_map<std::string, std::shared_ptr<raft::logger>> log_map;
+};  // class logger
+
+};  // namespace raft
diff --git a/cpp/include/raft/core/logger-inl.hpp b/cpp/include/raft/core/logger-inl.hpp
index d8f25f6d03..b63b98ae90 100644
--- a/cpp/include/raft/core/logger-inl.hpp
+++ b/cpp/include/raft/core/logger-inl.hpp
@@ -15,9 +15,6 @@
  */
 #pragma once
 
-#ifndef __RAFT_RT_LOGGER
-#define __RAFT_RT_LOGGER
-
 #include <stdarg.h>
 
 #include <algorithm>
@@ -30,38 +27,20 @@
 
 #include <stdarg.h>
 
+#include "logger-macros.hpp"
+// The logger-ext.hpp file contains the class declaration of the logger class.
+// In this case, it is okay to include the logger-ext.hpp file because it
+// contains no RAFT_EXPLICIT template instantiations.
+#include "logger-ext.hpp"
+
 #define SPDLOG_HEADER_ONLY
 #include <raft/core/detail/callback_sink.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <spdlog/sinks/stdout_color_sinks.h>  // NOLINT
 #include <spdlog/spdlog.h>                    // NOLINT
 
-/**
- * @defgroup logging levels used in raft
- *
- * @note exactly match the corresponding ones (but reverse in terms of value)
- *       in spdlog for wrapping purposes
- *
- * @{
- */
-#define RAFT_LEVEL_TRACE    6
-#define RAFT_LEVEL_DEBUG    5
-#define RAFT_LEVEL_INFO     4
-#define RAFT_LEVEL_WARN     3
-#define RAFT_LEVEL_ERROR    2
-#define RAFT_LEVEL_CRITICAL 1
-#define RAFT_LEVEL_OFF      0
-/** @} */
-
-#if !defined(RAFT_ACTIVE_LEVEL)
-#define RAFT_ACTIVE_LEVEL RAFT_LEVEL_INFO
-#endif
-
 namespace raft {
 
-static const std::string RAFT_NAME = "raft";
-static const std::string default_log_pattern("[%L] [%H:%M:%S.%f] %v");
-
 namespace detail {
 
 /**
@@ -105,6 +84,23 @@ inline int convert_level_to_spdlog(int level)
 
 }  // namespace detail
 
+class logger::impl {  // defined privately here
+                      // ... all private data and functions: all of these
+                      //     can now change without recompiling callers ...
+ public:
+  std::shared_ptr<spdlog::sinks::callback_sink_mt> sink;
+  std::shared_ptr<spdlog::logger> spdlogger;
+  std::string cur_pattern;
+  int cur_level;
+
+  impl(std::string const& name_ = "")
+    : sink{std::make_shared<spdlog::sinks::callback_sink_mt>()},
+      spdlogger{std::make_shared<spdlog::logger>(name_, sink)},
+      cur_pattern()
+  {
+  }
+};  // class logger::impl
+
 /**
  * @brief The main Logging class for raft library.
  *
@@ -115,210 +111,124 @@ inline int convert_level_to_spdlog(int level)
  * @todo This currently only supports logging to stdout. Need to add support in
  *       future to add custom loggers as well [Issue #2046]
  */
-class logger {
- public:
-  // @todo setting the logger once per process with
-  logger(std::string const& name_ = "")
-    : sink{std::make_shared<spdlog::sinks::callback_sink_mt>()},
-      spdlogger{std::make_shared<spdlog::logger>(name_, sink)},
-      cur_pattern()
-  {
-    set_pattern(default_log_pattern);
-    set_level(RAFT_ACTIVE_LEVEL);
-  }
-  /**
-   * @brief Singleton method to get the underlying logger object
-   *
-   * @return the singleton logger object
-   */
-  static logger& get(std::string const& name = "")
-  {
-    if (log_map.find(name) == log_map.end()) {
-      log_map[name] = std::make_shared<raft::logger>(name);
-    }
-    return *log_map[name];
-  }
+logger::logger(std::string const& name_) : pimpl(new impl(name_))
+{
+  set_pattern(default_log_pattern);
+  set_level(RAFT_ACTIVE_LEVEL);
+}
+/**
+ * @brief Singleton method to get the logger registered under `name`
+ * @return the logger for `name`, created on first use
+ */
+logger& logger::get(std::string const& name)
+{
+  auto& entry = log_map[name];  // single lookup; inserts an empty pointer on first use
+  if (!entry) { entry = std::make_shared<raft::logger>(name); }
+  return *entry;
+}
 
-  /**
-   * @brief Set the logging level.
-   *
-   * Only messages with level equal or above this will be printed
-   *
-   * @param[in] level logging level
-   *
-   * @note The log level will actually be set only if the input is within the
-   *       range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll
-   *       be ignored. See documentation of decisiontree for how this gets used
-   */
-  void set_level(int level)
-  {
-    level = raft::detail::convert_level_to_spdlog(level);
-    spdlogger->set_level(static_cast<spdlog::level::level_enum>(level));
-  }
+/**
+ * @brief Set the logging level.
+ *
+ * Only messages with a level equal to or above this will be printed
+ *
+ * @param[in] level logging level
+ *
+ * @note The log level will actually be set only if the input is within the
+ *       range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll
+ *       be ignored. See documentation of decisiontree for how this gets used
+ */
+void logger::set_level(int level)
+{
+  auto const spd_level = raft::detail::convert_level_to_spdlog(level);
+  pimpl->spdlogger->set_level(static_cast<spdlog::level::level_enum>(spd_level));
+}
 
-  /**
-   * @brief Set the logging pattern
-   *
-   * @param[in] pattern the pattern to be set. Refer this link
-   *                    https://github.com/gabime/spdlog/wiki/3.-Custom-formatting
-   *                    to know the right syntax of this pattern
-   */
-  void set_pattern(const std::string& pattern)
-  {
-    cur_pattern = pattern;
-    spdlogger->set_pattern(pattern);
-  }
+/**
+ * @brief Set the logging pattern
+ *
+ * @param[in] pattern the pattern to be set. Refer to
+ *                    https://github.com/gabime/spdlog/wiki/3.-Custom-formatting
+ *                    for the syntax of this pattern
+ */
+void logger::set_pattern(const std::string& pattern)
+{
+  pimpl->cur_pattern = pattern;
+  pimpl->spdlogger->set_pattern(pattern);
+}
 
-  /**
-   * @brief Register a callback function to be run in place of usual log call
-   *
-   * @param[in] callback the function to be run on all logged messages
-   */
-  void set_callback(void (*callback)(int lvl, const char* msg)) { sink->set_callback(callback); }
+/**
+ * @brief Register a callback function to be run in place of usual log call
+ *
+ * @param[in] callback the function to be run on all logged messages
+ */
+void logger::set_callback(void (*callback)(int lvl, const char* msg))
+{
+  pimpl->sink->set_callback(callback);
+}
 
-  /**
-   * @brief Register a flush function compatible with the registered callback
-   *
-   * @param[in] flush the function to use when flushing logs
-   */
-  void set_flush(void (*flush)()) { sink->set_flush(flush); }
+/**
+ * @brief Register a flush function compatible with the registered callback
+ *
+ * @param[in] flush the function to use when flushing logs
+ */
+void logger::set_flush(void (*flush)()) { pimpl->sink->set_flush(flush); }
 
-  /**
-   * @brief Tells whether messages will be logged for the given log level
-   *
-   * @param[in] level log level to be checked for
-   * @return true if messages will be logged for this level, else false
-   */
-  bool should_log_for(int level) const
-  {
-    level        = raft::detail::convert_level_to_spdlog(level);
-    auto level_e = static_cast<spdlog::level::level_enum>(level);
-    return spdlogger->should_log(level_e);
-  }
+/**
+ * @brief Tells whether messages will be logged for the given log level
+ *
+ * @param[in] level log level to be checked for
+ * @return true if messages will be logged for this level, else false
+ */
+bool logger::should_log_for(int level) const
+{
+  // Translate the RAFT level to spdlog's numbering before asking the wrapped logger.
+  auto const spd_level = raft::detail::convert_level_to_spdlog(level);
+  return pimpl->spdlogger->should_log(static_cast<spdlog::level::level_enum>(spd_level));
+}
 
-  /**
-   * @brief Query for the current log level
-   *
-   * @return the current log level
-   */
-  int get_level() const
-  {
-    auto level_e = spdlogger->level();
-    return RAFT_LEVEL_TRACE - static_cast<int>(level_e);
-  }
+/**
+ * @brief Query for the current log level
+ *
+ * @return the current log level
+ */
+int logger::get_level() const
+{
+  // RAFT levels are numbered in reverse of spdlog's, so mirror around TRACE.
+  return RAFT_LEVEL_TRACE - static_cast<int>(pimpl->spdlogger->level());
+}
 
-  /**
-   * @brief Get the current logging pattern
-   * @return the pattern
-   */
-  std::string get_pattern() const { return cur_pattern; }
+/**
+ * @brief Get the current logging pattern
+ * @return the pattern
+ */
+std::string logger::get_pattern() const { return pimpl->cur_pattern; }
 
-  /**
-   * @brief Main logging method
-   *
-   * @param[in] level logging level of this message
-   * @param[in] fmt   C-like format string, followed by respective params
-   */
-  void log(int level, const char* fmt, ...)
-  {
-    level        = raft::detail::convert_level_to_spdlog(level);
-    auto level_e = static_cast<spdlog::level::level_enum>(level);
-    // explicit check to make sure that we only expand messages when required
-    if (spdlogger->should_log(level_e)) {
-      va_list vl;
-      va_start(vl, fmt);
-      auto msg = raft::detail::format(fmt, vl);
-      va_end(vl);
-      spdlogger->log(level_e, msg);
-    }
+/**
+ * @brief Main logging method
+ *
+ * @param[in] level logging level of this message
+ * @param[in] fmt   C-like format string, followed by respective params
+ */
+void logger::log(int level, const char* fmt, ...)
+{
+  level        = raft::detail::convert_level_to_spdlog(level);
+  auto level_e = static_cast<spdlog::level::level_enum>(level);
+  // explicit check to make sure that we only expand messages when required
+  if (pimpl->spdlogger->should_log(level_e)) {
+    va_list vl;
+    va_start(vl, fmt);
+    auto msg = raft::detail::format(fmt, vl);
+    va_end(vl);
+    pimpl->spdlogger->log(level_e, msg);
   }
-
-  /**
-   * @brief Flush logs by calling flush on underlying logger
-   */
-  void flush() { spdlogger->flush(); }
-
-  ~logger() {}
-
- private:
-  logger();
-
-  static inline std::unordered_map<std::string, std::shared_ptr<raft::logger>> log_map;
-  std::shared_ptr<spdlog::sinks::callback_sink_mt> sink;
-  std::shared_ptr<spdlog::logger> spdlogger;
-  std::string cur_pattern;
-  int cur_level;
-};  // class logger
-
-};  // namespace raft
+}
 
 /**
- * @defgroup loggerMacros Helper macros for dealing with logging
- * @{
+ * @brief Flush logs by calling flush on underlying logger
  */
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE)
-#define RAFT_LOG_TRACE(fmt, ...)                                          \
-  do {                                                                    \
-    std::stringstream ss;                                                 \
-    ss << raft::detail::format("%s:%d ", __FILE__, __LINE__);             \
-    ss << raft::detail::format(fmt, ##__VA_ARGS__);                       \
-    raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \
-  } while (0)
-#else
-#define RAFT_LOG_TRACE(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE)
-#define RAFT_LOG_TRACE_VEC(ptr, len)                                      \
-  do {                                                                    \
-    std::stringstream ss;                                                 \
-    ss << raft::detail::format("%s:%d ", __FILE__, __LINE__);             \
-    print_vector(#ptr, ptr, len, ss);                                     \
-    raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, ss.str().c_str()); \
-  } while (0)
-#else
-#define RAFT_LOG_TRACE_VEC(ptr, len) void(0)
-#endif
+void logger::flush() { pimpl->spdlogger->flush(); }
 
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG)
-#define RAFT_LOG_DEBUG(fmt, ...)                                          \
-  do {                                                                    \
-    std::stringstream ss;                                                 \
-    ss << raft::detail::format("%s:%d ", __FILE__, __LINE__);             \
-    ss << raft::detail::format(fmt, ##__VA_ARGS__);                       \
-    raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_DEBUG, ss.str().c_str()); \
-  } while (0)
-#else
-#define RAFT_LOG_DEBUG(fmt, ...) void(0)
-#endif
+logger::~logger() {}
 
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_INFO)
-#define RAFT_LOG_INFO(fmt, ...) \
-  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_INFO, fmt, ##__VA_ARGS__)
-#else
-#define RAFT_LOG_INFO(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_WARN)
-#define RAFT_LOG_WARN(fmt, ...) \
-  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_WARN, fmt, ##__VA_ARGS__)
-#else
-#define RAFT_LOG_WARN(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_ERROR)
-#define RAFT_LOG_ERROR(fmt, ...) \
-  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_ERROR, fmt, ##__VA_ARGS__)
-#else
-#define RAFT_LOG_ERROR(fmt, ...) void(0)
-#endif
-
-#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_CRITICAL)
-#define RAFT_LOG_CRITICAL(fmt, ...) \
-  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_CRITICAL, fmt, ##__VA_ARGS__)
-#else
-#define RAFT_LOG_CRITICAL(fmt, ...) void(0)
-#endif
-/** @} */
-
-#endif
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/core/logger-macros.hpp b/cpp/include/raft/core/logger-macros.hpp
new file mode 100644
index 0000000000..5ddb072067
--- /dev/null
+++ b/cpp/include/raft/core/logger-macros.hpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+/**
+ * @defgroup logging levels used in raft
+ *
+ * @note exactly match the corresponding ones (but reverse in terms of value)
+ *       in spdlog for wrapping purposes
+ *
+ * @{
+ */
+#define RAFT_LEVEL_TRACE    6
+#define RAFT_LEVEL_DEBUG    5
+#define RAFT_LEVEL_INFO     4
+#define RAFT_LEVEL_WARN     3
+#define RAFT_LEVEL_ERROR    2
+#define RAFT_LEVEL_CRITICAL 1
+#define RAFT_LEVEL_OFF      0
+/** @} */
+
+#if !defined(RAFT_ACTIVE_LEVEL)
+#define RAFT_ACTIVE_LEVEL RAFT_LEVEL_INFO
+#endif
+
+/**
+ * @defgroup loggerMacros Helper macros for dealing with logging
+ * @{
+ */
+#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE)
+#define RAFT_LOG_TRACE(fmt, ...)                                                \
+  do { /* log via "%s" so a '%' in the message is not re-parsed */              \
+    std::stringstream ss;                                                       \
+    ss << raft::detail::format("%s:%d ", __FILE__, __LINE__);                   \
+    ss << raft::detail::format(fmt, ##__VA_ARGS__);                             \
+    raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, "%s", ss.str().c_str()); \
+  } while (0)
+#else
+#define RAFT_LOG_TRACE(fmt, ...) void(0)
+#endif
+
+#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_TRACE)
+#define RAFT_LOG_TRACE_VEC(ptr, len)                                            \
+  do { /* log via "%s" so a '%' in the message is not re-parsed */              \
+    std::stringstream ss;                                                       \
+    ss << raft::detail::format("%s:%d ", __FILE__, __LINE__);                   \
+    print_vector(#ptr, ptr, len, ss);                                           \
+    raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_TRACE, "%s", ss.str().c_str()); \
+  } while (0)
+#else
+#define RAFT_LOG_TRACE_VEC(ptr, len) void(0)
+#endif
+
+#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG)
+#define RAFT_LOG_DEBUG(fmt, ...)                                                \
+  do { /* log via "%s" so a '%' in the message is not re-parsed */              \
+    std::stringstream ss;                                                       \
+    ss << raft::detail::format("%s:%d ", __FILE__, __LINE__);                   \
+    ss << raft::detail::format(fmt, ##__VA_ARGS__);                             \
+    raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_DEBUG, "%s", ss.str().c_str()); \
+  } while (0)
+#else
+#define RAFT_LOG_DEBUG(fmt, ...) void(0)
+#endif
+
+#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_INFO)
+#define RAFT_LOG_INFO(fmt, ...) \
+  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_INFO, fmt, ##__VA_ARGS__)
+#else
+#define RAFT_LOG_INFO(fmt, ...) void(0)
+#endif
+
+#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_WARN)
+#define RAFT_LOG_WARN(fmt, ...) \
+  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_WARN, fmt, ##__VA_ARGS__)
+#else
+#define RAFT_LOG_WARN(fmt, ...) void(0)
+#endif
+
+#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_ERROR)
+#define RAFT_LOG_ERROR(fmt, ...) \
+  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_ERROR, fmt, ##__VA_ARGS__)
+#else
+#define RAFT_LOG_ERROR(fmt, ...) void(0)
+#endif
+
+#if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_CRITICAL)
+#define RAFT_LOG_CRITICAL(fmt, ...) \
+  raft::logger::get(RAFT_NAME).log(RAFT_LEVEL_CRITICAL, fmt, ##__VA_ARGS__)
+#else
+#define RAFT_LOG_CRITICAL(fmt, ...) void(0)
+#endif
+/** @} */
diff --git a/cpp/include/raft/core/logger.hpp b/cpp/include/raft/core/logger.hpp
index e69de29bb2..84e44ab7e8 100644
--- a/cpp/include/raft/core/logger.hpp
+++ b/cpp/include/raft/core/logger.hpp
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "logger-macros.hpp"
+
+#if defined(RAFT_COMPILED) && defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "logger-ext.hpp"
+#else
+#include "logger-inl.hpp"
+#endif
diff --git a/cpp/src/core/logger.cpp b/cpp/src/core/logger.cpp
new file mode 100644
index 0000000000..8f81cf2926
--- /dev/null
+++ b/cpp/src/core/logger.cpp
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <raft/core/logger-inl.hpp>

From db53bde3f3c021501f6e7eb2a8ad61dc519059a5 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 4 Apr 2023 11:11:51 +0200
Subject: [PATCH 07/89] Split raft/spatial/knn/detail/ball_cover/registers.cuh

This dramatically reduces the compile times of

ball_cover_knn_query.cu and ball_cover_all_knn_query.cu

They used to take 900 seconds. Now they take ~25s.
---
 cpp/CMakeLists.txt                            |   1 +
 .../spatial/knn/detail/ball_cover/common.cuh  |  39 +-----
 .../knn/detail/ball_cover/registers-ext.cuh   | 129 ++++++++++++++++++
 .../knn/detail/ball_cover/registers-inl.cuh   |   1 +
 .../knn/detail/ball_cover/registers-types.cuh |  66 +++++++++
 .../knn/detail/ball_cover/registers.cuh       |  23 ++++
 cpp/include/raft/util/raft_explicit.hpp       |  63 +++++++++
 .../knn/detail/ball_cover/registers.cu        |  60 ++++++++
 8 files changed, 345 insertions(+), 37 deletions(-)
 create mode 100644 cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
 create mode 100644 cpp/include/raft/spatial/knn/detail/ball_cover/registers-types.cuh
 create mode 100644 cpp/include/raft/util/raft_explicit.hpp
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e3cdcbf760..94b24be853 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -444,6 +444,7 @@ if(RAFT_COMPILE_LIBRARY)
     src/random/rmat_rectangular_generator_int64_double.cu
     src/random/rmat_rectangular_generator_int_float.cu
     src/random/rmat_rectangular_generator_int64_float.cu
+    src/spatial/knn/detail/ball_cover/registers.cu
   )
   set_target_properties(
     raft_lib
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh
index 0a6718f5a5..5522e867fd 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include "../haversine_distance.cuh"
+#include "registers-types.cuh"
 #include <cstdint>
 #include <thrust/functional.h>
 #include <thrust/tuple.h>
@@ -39,42 +40,6 @@ struct NNComp {
   }
 };
 
-template <typename value_t, typename value_int = std::uint32_t>
-struct DistFunc {
-  virtual __device__ __host__ __forceinline__ value_t operator()(const value_t* a,
-                                                                 const value_t* b,
-                                                                 const value_int n_dims)
-  {
-    return -1;
-  };
-};
-
-template <typename value_t, typename value_int = std::uint32_t>
-struct HaversineFunc : public DistFunc<value_t, value_int> {
-  __device__ __host__ __forceinline__ value_t operator()(const value_t* a,
-                                                         const value_t* b,
-                                                         const value_int n_dims) override
-  {
-    return raft::spatial::knn::detail::compute_haversine(a[0], b[0], a[1], b[1]);
-  }
-};
-
-template <typename value_t, typename value_int = std::uint32_t>
-struct EuclideanFunc : public DistFunc<value_t, value_int> {
-  __device__ __host__ __forceinline__ value_t operator()(const value_t* a,
-                                                         const value_t* b,
-                                                         const value_int n_dims) override
-  {
-    value_t sum_sq = 0;
-    for (value_int i = 0; i < n_dims; ++i) {
-      value_t diff = a[i] - b[i];
-      sum_sq += diff * diff;
-    }
-
-    return raft::sqrt(sum_sq);
-  }
-};
-
 /**
  * Zeros the bit at location h in a one-hot encoded 32-bit int array
  */
@@ -105,4 +70,4 @@ __device__ inline bool _get_val(std::uint32_t* arr, std::uint32_t h)
 };  // namespace detail
 };  // namespace knn
 };  // namespace spatial
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
new file mode 100644
index 0000000000..b5b54c62a7
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../../ball_cover_types.hpp"
+#include "registers-types.cuh"          // DistFunc
+#include <cstdint>                      // uint32_t
+#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+
+#if defined(RAFT_EXPLICIT_INSTANTIATE)
+
+namespace raft::spatial::knn::detail {
+
+template <typename value_idx,
+          typename value_t,
+          typename value_int = std::uint32_t,
+          int dims           = 2,
+          typename dist_func>
+void rbc_low_dim_pass_one(raft::device_resources const& handle,
+                          const BallCoverIndex<value_idx, value_t, value_int>& index,
+                          const value_t* query,
+                          const value_int n_query_rows,
+                          value_int k,
+                          const value_idx* R_knn_inds,
+                          const value_t* R_knn_dists,
+                          dist_func& dfunc,
+                          value_idx* inds,
+                          value_t* dists,
+                          float weight,
+                          value_int* dists_counter) RAFT_EXPLICIT;
+
+template <typename value_idx,
+          typename value_t,
+          typename value_int = std::uint32_t,
+          int dims           = 2,
+          typename dist_func>
+void rbc_low_dim_pass_two(raft::device_resources const& handle,
+                          const BallCoverIndex<value_idx, value_t, value_int>& index,
+                          const value_t* query,
+                          const value_int n_query_rows,
+                          value_int k,
+                          const value_idx* R_knn_inds,
+                          const value_t* R_knn_dists,
+                          dist_func& dfunc,
+                          value_idx* inds,
+                          value_t* dists,
+                          float weight,
+                          value_int* post_dists_counter) RAFT_EXPLICIT;
+
+};  // namespace raft::spatial::knn::detail
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  extern template void                                                                       \
+  raft::spatial::knn::detail::rbc_low_dim_pass_one<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  extern template void                                                                       \
+  raft::spatial::knn::detail::rbc_low_dim_pass_two<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* post_dists_counter)
+
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(
+  std::int64_t, float, std::uint32_t, 2, raft::spatial::knn::detail::HaversineFunc);
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(
+  std::int64_t, float, std::uint32_t, 3, raft::spatial::knn::detail::HaversineFunc);
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(
+  std::int64_t, float, std::uint32_t, 2, raft::spatial::knn::detail::EuclideanFunc);
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(
+  std::int64_t, float, std::uint32_t, 3, raft::spatial::knn::detail::EuclideanFunc);
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(
+  std::int64_t, float, std::uint32_t, 2, raft::spatial::knn::detail::DistFunc);
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(
+  std::int64_t, float, std::uint32_t, 3, raft::spatial::knn::detail::DistFunc);
+
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(
+  std::int64_t, float, std::uint32_t, 2, raft::spatial::knn::detail::HaversineFunc);
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(
+  std::int64_t, float, std::uint32_t, 3, raft::spatial::knn::detail::HaversineFunc);
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(
+  std::int64_t, float, std::uint32_t, 2, raft::spatial::knn::detail::EuclideanFunc);
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(
+  std::int64_t, float, std::uint32_t, 3, raft::spatial::knn::detail::EuclideanFunc);
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(
+  std::int64_t, float, std::uint32_t, 2, raft::spatial::knn::detail::DistFunc);
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(
+  std::int64_t, float, std::uint32_t, 3, raft::spatial::knn::detail::DistFunc);
+
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-inl.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-inl.cuh
index f665368c41..9c624dcb08 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-inl.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-inl.cuh
@@ -20,6 +20,7 @@
 
 #include "../../ball_cover_types.hpp"
 #include "../haversine_distance.cuh"
+#include "registers-types.cuh"  // DistFunc
 
 #include <cstdint>
 #include <limits.h>
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-types.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-types.cuh
new file mode 100644
index 0000000000..7f4268d2dc
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-types.cuh
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "../haversine_distance.cuh"  // compute_haversine
+#include <cstdint>                    // uint32_t
+
+namespace raft {
+namespace spatial {
+namespace knn {
+namespace detail {
+
+template <typename value_t, typename value_int = std::uint32_t>
+struct DistFunc {
+  virtual __device__ __host__ __forceinline__ value_t operator()(const value_t* a,
+                                                                 const value_t* b,
+                                                                 const value_int n_dims)
+  {
+    return -1;
+  };
+};
+
+template <typename value_t, typename value_int = std::uint32_t>
+struct HaversineFunc : public DistFunc<value_t, value_int> {
+  __device__ __host__ __forceinline__ value_t operator()(const value_t* a,
+                                                         const value_t* b,
+                                                         const value_int n_dims) override
+  {
+    return raft::spatial::knn::detail::compute_haversine(a[0], b[0], a[1], b[1]);
+  }
+};
+
+template <typename value_t, typename value_int = std::uint32_t>
+struct EuclideanFunc : public DistFunc<value_t, value_int> {
+  __device__ __host__ __forceinline__ value_t operator()(const value_t* a,
+                                                         const value_t* b,
+                                                         const value_int n_dims) override
+  {
+    value_t sum_sq = 0;
+    for (value_int i = 0; i < n_dims; ++i) {
+      value_t diff = a[i] - b[i];
+      sum_sq += diff * diff;
+    }
+
+    return raft::sqrt(sum_sq);
+  }
+};
+
+};  // namespace detail
+};  // namespace knn
+};  // namespace spatial
+};  // namespace raft
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
index e69de29bb2..399d4b07c6 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if defined(RAFT_COMPILED) && defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "registers-ext.cuh"  // chosen only when both macros are defined
+#else
+#include "registers-inl.cuh"  // fallback in every other configuration
+#endif  // RAFT_COMPILED && RAFT_EXPLICIT_INSTANTIATE
diff --git a/cpp/include/raft/util/raft_explicit.hpp b/cpp/include/raft/util/raft_explicit.hpp
new file mode 100644
index 0000000000..fd81fe23de
--- /dev/null
+++ b/cpp/include/raft/util/raft_explicit.hpp
@@ -0,0 +1,63 @@
+/* Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+// Expands to a function body whose only statement calls the static_assert helper below.
+#define RAFT_EXPLICIT                                                     \
+  {                                                                       \
+    raft::util::raft_explicit::do_not_implicitly_instantiate_templates(); \
+  }
+
+namespace raft::util::raft_explicit {
+
+// To make sure the static_assert only fires when this function is instantiated (it is called
+// from the RAFT_EXPLICIT function body), it depends on a dummy template parameter; see P2593:
+// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html
+template <bool implicit_instantiation_allowed = false>
+void do_not_implicitly_instantiate_templates()
+{
+  static_assert(implicit_instantiation_allowed,
+                "ACCIDENTAL_IMPLICIT_INSTANTIATION\n\n"
+
+                "If you see this error, then you have implicitly instantiated a function\n"
+                "template. To keep compile times in check, RAFT has the policy of\n"
+                "explicitly instantiating templates. To fix the compilation error, follow\n"
+                "these steps.\n\n"
+
+                "If you scroll up a bit in your error message, you probably saw two lines\n"
+                "like the following:\n\n"
+
+                "[.. snip ..] required from ‘void "
+                "raft::util::raft_explicit::do_not_implicitly_instantiate_templates() "
+                "[with bool implicit_instantiation_allowed = false]’\n"
+                "[.. snip ..] from ‘void raft::bar(T) [with T = double]’\n\n"
+
+                "Simple solution:\n\n"
+
+                "    Add '#undef RAFT_EXPLICIT_INSTANTIATE' at the top of your .cpp/.cu file.\n\n"
+
+                "Best solution:\n\n"
+
+                "    1. Add the following line to the file include/raft/bar.hpp:\n\n"
+
+                "        extern template void raft::bar<double>(double);\n\n"
+
+                "    2. Add the following line to the file src/raft/bar.cpp:\n\n"
+
+                "        template void raft::bar<double>(double);\n\n"
+
+                "Most likely, there are many other similar lines in both files.\n");
+}
+
+}  // namespace raft::util::raft_explicit
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers.cu b/cpp/src/spatial/knn/detail/ball_cover/registers.cu
new file mode 100644
index 0000000000..0bb6d123a9
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers.cu
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+// Expands to an explicit instantiation definition of rbc_low_dim_pass_one for the given types.
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims)                                                   \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_one<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    raft::spatial::knn::detail::DistFunc<Mvalue_t, Mvalue_int>& dfunc,                       \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+// Expands to an explicit instantiation definition of rbc_low_dim_pass_two for the given types.
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims)                                                   \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_two<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    raft::spatial::knn::detail::DistFunc<Mvalue_t, Mvalue_int>& dfunc,                       \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+// Instantiate both passes for Mdims = 2 and 3 with (std::int64_t, float, std::uint32_t).
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(std::int64_t, float, std::uint32_t, 2);
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(std::int64_t, float, std::uint32_t, 3);
+
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(std::int64_t, float, std::uint32_t, 2);
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(std::int64_t, float, std::uint32_t, 3);
+
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one

From 4d26ca91795b07fa1791560429f6b1271676db5f Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 14:01:33 +0200
Subject: [PATCH 08/89] Split memory_pool, fused_l2_knn, coalesced_reduction,
 selection_faiss

---
 cpp/CMakeLists.txt                            | 40 ++++++---
 .../core/resource/device_memory_resource.hpp  |  3 +-
 .../linalg/detail/coalesced_reduction-ext.cuh | 74 ++++++++++++++++
 .../linalg/detail/coalesced_reduction-inl.cuh | 21 +----
 .../detail/coalesced_reduction-types.cuh      | 34 ++++++++
 .../linalg/detail/coalesced_reduction.cuh     | 25 ++++++
 .../raft/neighbors/detail/knn_brute_force.cuh |  1 +
 .../neighbors/detail/selection_faiss-ext.cuh  | 70 ++++++++++++++++
 .../raft/neighbors/detail/selection_faiss.cuh | 25 ++++++
 .../spatial/knn/detail/fused_l2_knn-ext.cuh   | 84 +++++++++++++++++++
 .../raft/spatial/knn/detail/fused_l2_knn.cuh  | 24 ++++++
 cpp/include/raft/util/cudart_utils.hpp        | 52 +-----------
 cpp/include/raft/util/memory_pool-ext.hpp     | 59 +++++++++++++
 cpp/include/raft/util/memory_pool-inl.hpp     | 43 ++++++++++
 cpp/include/raft/util/memory_pool.hpp         | 23 +++++
 cpp/src/linalg/detail/coalesced_reduction.cu  | 69 +++++++++++++++
 cpp/src/neighbors/detail/selection_faiss.cu   | 35 ++++++++
 cpp/src/util/memory_pool.cpp                  | 17 ++++
 cpp/test/util/device_atomics.cu               |  1 -
 19 files changed, 620 insertions(+), 80 deletions(-)
 create mode 100644 cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh
 create mode 100644 cpp/include/raft/linalg/detail/coalesced_reduction-types.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
 create mode 100644 cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
 create mode 100644 cpp/include/raft/util/memory_pool-ext.hpp
 create mode 100644 cpp/include/raft/util/memory_pool-inl.hpp
 create mode 100644 cpp/src/linalg/detail/coalesced_reduction.cu
 create mode 100644 cpp/src/neighbors/detail/selection_faiss.cu
 create mode 100644 cpp/src/util/memory_pool.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 94b24be853..f39638946c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -13,6 +13,17 @@
 set(RAPIDS_VERSION "23.04")
 set(RAFT_VERSION "23.04.00")
 
+include(FetchContent)  # NOTE(review): this block runs before cmake_minimum_required() below
+FetchContent_Declare(
+  rapids-cmake
+  GIT_REPOSITORY https://github.com/ahendriksen/rapids-cmake.git
+  GIT_TAG different-rmm  # NOTE(review): personal fork/branch; presumably temporary -- confirm
+)
+file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.02/RAPIDS.cmake
+     ${CMAKE_CURRENT_BINARY_DIR}/RAPIDS.cmake  # NOTE(review): 23.02 here vs RAPIDS_VERSION 23.04
+)
+include(${CMAKE_CURRENT_BINARY_DIR}/RAPIDS.cmake)
+
 cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 include(../fetch_rapids.cmake)
 include(rapids-cmake)
@@ -276,12 +287,14 @@ if(RAFT_COMPILE_LIBRARY)
     src/cluster/cluster_cost_float.cu
     src/cluster/cluster_cost_double.cu
     src/core/logger.cpp
+    src/linalg/detail/coalesced_reduction.cu
     src/neighbors/refine_d_int64_t_float.cu
     src/neighbors/refine_d_int64_t_int8_t.cu
     src/neighbors/refine_d_int64_t_uint8_t.cu
     src/neighbors/refine_h_int64_t_float.cu
     src/neighbors/refine_h_int64_t_int8_t.cu
     src/neighbors/refine_h_int64_t_uint8_t.cu
+    src/neighbors/detail/selection_faiss.cu
     src/neighbors/specializations/refine_d_int64_t_float.cu
     src/neighbors/specializations/refine_d_int64_t_int8_t.cu
     src/neighbors/specializations/refine_d_int64_t_uint8_t.cu
@@ -382,17 +395,17 @@ if(RAFT_COMPILE_LIBRARY)
     src/random/rmat_rectangular_generator_int64_double.cu
     src/random/rmat_rectangular_generator_int_float.cu
     src/random/rmat_rectangular_generator_int64_float.cu
-    src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
-    src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
-    src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
-    src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_3d.cu
+    # src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
+    # src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
+    # src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
+    # src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_3d.cu
     src/neighbors/specializations/ball_cover_all_knn_query.cu
     src/neighbors/specializations/ball_cover_build_index.cu
     src/neighbors/specializations/ball_cover_knn_query.cu
-    src/neighbors/specializations/fused_l2_knn_long_float_true.cu
-    src/neighbors/specializations/fused_l2_knn_long_float_false.cu
-    src/neighbors/specializations/fused_l2_knn_int_float_true.cu
-    src/neighbors/specializations/fused_l2_knn_int_float_false.cu
+    # src/neighbors/specializations/fused_l2_knn_long_float_true.cu
+    # src/neighbors/specializations/fused_l2_knn_long_float_false.cu
+    # src/neighbors/specializations/fused_l2_knn_int_float_true.cu
+    # src/neighbors/specializations/fused_l2_knn_int_float_false.cu
     src/neighbors/ivf_flat_search.cu
     src/neighbors/ivf_flat_build.cu
     src/neighbors/specializations/ivfflat_build_float_int64_t.cu
@@ -445,6 +458,7 @@ if(RAFT_COMPILE_LIBRARY)
     src/random/rmat_rectangular_generator_int_float.cu
     src/random/rmat_rectangular_generator_int64_float.cu
     src/spatial/knn/detail/ball_cover/registers.cu
+    src/util/memory_pool.cpp
   )
   set_target_properties(
     raft_lib
@@ -618,7 +632,9 @@ rapids_export(
   COMPONENTS ${raft_components}
   COMPONENTS_EXPORT_SET ${raft_export_sets}
   GLOBAL_TARGETS raft compiled distributed
-  NAMESPACE raft:: DOCUMENTATION doc_string FINAL_CODE_BLOCK code_string
+  NAMESPACE raft::
+  DOCUMENTATION doc_string
+  FINAL_CODE_BLOCK code_string
 )
 
 # ##################################################################################################
@@ -628,8 +644,10 @@ rapids_export(
   EXPORT_SET raft-exports
   COMPONENTS ${raft_components}
   COMPONENTS_EXPORT_SET ${raft_export_sets}
-  GLOBAL_TARGETS raft
-  compiled distributed DOCUMENTATION doc_string NAMESPACE raft:: FINAL_CODE_BLOCK code_string
+  GLOBAL_TARGETS raft compiled distributed
+  DOCUMENTATION doc_string
+  NAMESPACE raft::
+  FINAL_CODE_BLOCK code_string
 )
 
 # ##################################################################################################
diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index 35ae3d715f..ebc41e0f8e 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -18,6 +18,7 @@
 #include <raft/core/resource/resource_types.hpp>
 #include <raft/core/resources.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
 
 namespace raft::resource {
 class device_memory_resource : public resource {
@@ -72,4 +73,4 @@ inline void set_workspace_resource(resources const& res, rmm::mr::device_memory_
 {
   res.add_resource_factory(std::make_shared<workspace_resource_factory>(mr));
 };
-}  // namespace raft::resource
\ No newline at end of file
+}  // namespace raft::resource
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh
new file mode 100644
index 0000000000..2a1bafae43
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "coalesced_reduction-types.cuh"
+#include <raft/core/operators.hpp>
+
+// Include inline definition as well. We cannot possibly cover all
+// instantiations in this file.
+#include "coalesced_reduction-inl.cuh"
+// Emits an `extern template` declaration of coalescedReduction for one type combination.
+#define instantiate_raft_linalg_detail_coalescedReduction(                              \
+  InType, OutType, IdxType, MainLambda, ReduceLambda, FinalLambda)                      \
+  extern template void raft::linalg::detail::coalescedReduction(OutType* dots,          \
+                                                                const InType* data,     \
+                                                                IdxType D,              \
+                                                                IdxType N,              \
+                                                                OutType init,           \
+                                                                cudaStream_t stream,    \
+                                                                bool inplace,           \
+                                                                MainLambda main_op,     \
+                                                                ReduceLambda reduce_op, \
+                                                                FinalLambda final_op)
+// NOTE(review): each entry presumably needs a matching instantiation in the .cu file -- confirm.
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::identity_op, raft::min_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::sq_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::sq_op, raft::add_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::abs_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::abs_op, raft::max_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::abs_op, raft::add_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::abs_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::identity_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::identity_op, raft::min_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::sq_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::sq_op, raft::add_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, long, raft::sq_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::identity_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::sq_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::abs_op, raft::max_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::sq_op, raft::add_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, unsigned int, raft::sq_op, raft::add_op, raft::identity_op);
+
+#undef instantiate_raft_linalg_detail_coalescedReduction
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
index 4cc19d4a17..7ba4537b0f 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
@@ -16,23 +16,17 @@
 
 #pragma once
 
+#include "coalesced_reduction-types.cuh"  // policy structs
 #include <cub/cub.cuh>
-#include <raft/common/nvtx.hpp>
+#include <raft/core/nvtx.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/util/cuda_utils.cuh>
-#include <rmm/device_uvector.hpp>
+#include <rmm/device_uvector.hpp>  // device_uvector
 
 namespace raft {
 namespace linalg {
 namespace detail {
 
-template <int warpSize, int rpb>
-struct ReductionThinPolicy {
-  static constexpr int LogicalWarpSize = warpSize;
-  static constexpr int RowsPerBlock    = rpb;
-  static constexpr int ThreadsPerBlock = LogicalWarpSize * RowsPerBlock;
-};
-
 template <typename Policy,
           typename InType,
           typename OutType,
@@ -212,13 +206,6 @@ void coalescedReductionMediumDispatcher(OutType* dots,
     dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
 }
 
-template <int tpb, int bpr>
-struct ReductionThickPolicy {
-  static constexpr int ThreadsPerBlock = tpb;
-  static constexpr int BlocksPerRow    = bpr;
-  static constexpr int BlockStride     = tpb * bpr;
-};
-
 template <typename Policy,
           typename InType,
           typename OutType,
@@ -365,4 +352,4 @@ void coalescedReduction(OutType* dots,
 
 }  // namespace detail
 }  // namespace linalg
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-types.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-types.cuh
new file mode 100644
index 0000000000..c31b4363dd
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction-types.cuh
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+namespace raft::linalg::detail {
+
+template <int warpSize, int rpb>
+struct ReductionThinPolicy {  // compile-time constants derived from <warpSize, rpb>
+  static constexpr int LogicalWarpSize = warpSize;
+  static constexpr int RowsPerBlock    = rpb;
+  static constexpr int ThreadsPerBlock = LogicalWarpSize * RowsPerBlock;
+};
+
+template <int tpb, int bpr>
+struct ReductionThickPolicy {  // compile-time constants derived from <tpb, bpr>
+  static constexpr int ThreadsPerBlock = tpb;
+  static constexpr int BlocksPerRow    = bpr;
+  static constexpr int BlockStride     = tpb * bpr;  // i.e. ThreadsPerBlock * BlocksPerRow
+};
+
+}  // namespace raft::linalg::detail
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
index e69de29bb2..9a51611de1 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if defined(RAFT_COMPILED) && defined(RAFT_EXPLICIT_INSTANTIATE)
+// Too many lambdas and complicated types to instantiate everything, so -inl is included too.
+#include "coalesced_reduction-ext.cuh"
+#include "coalesced_reduction-inl.cuh"
+#else
+#include "coalesced_reduction-inl.cuh"
+#endif  // RAFT_COMPILED && RAFT_EXPLICIT_INSTANTIATE
diff --git a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
index a776ce2586..0148a1a887 100644
--- a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
+++ b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
@@ -36,6 +36,7 @@
 #include <raft/neighbors/detail/selection_faiss.cuh>
 #include <raft/spatial/knn/detail/fused_l2_knn.cuh>
 #include <raft/spatial/knn/detail/haversine_distance.cuh>
+#include <raft/spatial/knn/detail/processing.cuh>
 #include <set>
 #include <thrust/iterator/transform_iterator.h>
 
diff --git a/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
new file mode 100644
index 0000000000..cd6fdee192
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>                      // size_t
+#include <cstdint>                      // uint32_t
+#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+
+#if defined(RAFT_EXPLICIT_INSTANTIATE)
+
+namespace raft::neighbors::detail {
+/**
+ * @brief Select the k-nearest neighbors from dense distance and index matrices.
+ * @tparam payload_t element type of the index matrices (inV, outV)
+ * @tparam key_t element type of the distance matrices (inK, outK)
+ * @param[in] inK partitioned knn distance matrix
+ * @param[in] inV partitioned knn index matrix
+ * @param[in] n_rows number of rows in distance and index matrices
+ * @param[in] n_cols number of columns in distance and index matrices
+ * @param[out] outK merged knn distance matrix
+ * @param[out] outV merged knn index matrix
+ * @param[in] select_min whether to select the min or the max distances
+ * @param[in] k number of neighbors per partition (also number of merged neighbors)
+ * @param[in] stream CUDA stream to use
+ */
+template <typename payload_t = int, typename key_t = float>
+void select_k(const key_t* inK,
+              const payload_t* inV,
+              size_t n_rows,
+              size_t n_cols,
+              key_t* outK,
+              payload_t* outV,
+              bool select_min,
+              int k,
+              cudaStream_t stream) RAFT_EXPLICIT;
+};  // namespace raft::neighbors::detail
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+// Emits an `extern template` declaration of select_k for one (payload_t, key_t) pair.
+#define instantiate_raft_neighbors_detail_select_k(payload_t, key_t)           \
+  extern template void raft::neighbors::detail::select_k(const key_t* inK,     \
+                                                         const payload_t* inV, \
+                                                         size_t n_rows,        \
+                                                         size_t n_cols,        \
+                                                         key_t* outK,          \
+                                                         payload_t* outV,      \
+                                                         bool select_min,      \
+                                                         int k,                \
+                                                         cudaStream_t stream)
+
+// Argument order is (payload_t, key_t): in the select_k signature the distances (inK/outK)
+// are the keys and the indices (inV/outV) are the payload, hence (uint32_t, float) below.
+instantiate_raft_neighbors_detail_select_k(uint32_t, float);
+instantiate_raft_neighbors_detail_select_k(long, float);
+
+#undef instantiate_raft_neighbors_detail_select_k
diff --git a/cpp/include/raft/neighbors/detail/selection_faiss.cuh b/cpp/include/raft/neighbors/detail/selection_faiss.cuh
index e69de29bb2..ccdba994d1 100644
--- a/cpp/include/raft/neighbors/detail/selection_faiss.cuh
+++ b/cpp/include/raft/neighbors/detail/selection_faiss.cuh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "selection_faiss-inl.cuh"  // definitions first: -ext needs select_k declared
+#endif
+
+#if defined(RAFT_COMPILED)
+#include "selection_faiss-ext.cuh"  // extern template declarations
+#endif
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
new file mode 100644
index 0000000000..5f4d5a6347
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstddef>                           // size_t
+#include <raft/distance/distance_types.hpp>  // DistanceType
+#include <raft/util/raft_explicit.hpp>       // RAFT_EXPLICIT
+
+#if defined(RAFT_EXPLICIT_INSTANTIATE)
+
+namespace raft::spatial::knn::detail {
+/**
+ * Compute the k-nearest neighbors using L2 expanded/unexpanded distance.
+ * @tparam value_idx type of the output indices (out_inds)
+ * @tparam value_t type of the distances and of the index/query elements
+ * @param[in] D number of columns of the index and query arrays
+ * @param[out] out_inds output indices array on device (size n_query_rows * k)
+ * @param[out] out_dists output dists array on device (size n_query_rows * k)
+ * @param[in] index input index array on device (size n_index_rows * D)
+ * @param[in] query input query array on device (size n_query_rows * D)
+ * @param[in] n_index_rows number of rows in index array
+ * @param[in] n_query_rows number of rows in query array
+ * @param[in] k number of closest neighbors to return
+ * @param[in] rowMajorIndex are the index arrays in row-major layout?
+ * @param[in] rowMajorQuery is the query array in row-major layout?
+ * @param[in] stream stream to order kernel launch
+ * @param[in] metric distance metric to use
+ */
+template <typename value_idx, typename value_t, bool usePrevTopKs = false>
+void fusedL2Knn(size_t D,
+                value_idx* out_inds,
+                value_t* out_dists,
+                const value_t* index,
+                const value_t* query,
+                size_t n_index_rows,
+                size_t n_query_rows,
+                int k,
+                bool rowMajorIndex,
+                bool rowMajorQuery,
+                cudaStream_t stream,
+                raft::distance::DistanceType metric) RAFT_EXPLICIT;
+}  // namespace raft::spatial::knn::detail
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+// Emits an `extern template` declaration of fusedL2Knn for one template-argument triple.
+#define instantiate_raft_spatial_knn_detail_fusedL2Knn(Mvalue_idx, Mvalue_t, MusePrevTopKs) \
+  extern template void                                                                      \
+  raft::spatial::knn::detail::fusedL2Knn<Mvalue_idx, Mvalue_t, MusePrevTopKs>(              \
+    size_t D,                                                                               \
+    Mvalue_idx * out_inds,                                                                  \
+    Mvalue_t * out_dists,                                                                   \
+    const Mvalue_t* index,                                                                  \
+    const Mvalue_t* query,                                                                  \
+    size_t n_index_rows,                                                                    \
+    size_t n_query_rows,                                                                    \
+    int k,                                                                                  \
+    bool rowMajorIndex,                                                                     \
+    bool rowMajorQuery,                                                                     \
+    cudaStream_t stream,                                                                    \
+    raft::distance::DistanceType metric)
+
+instantiate_raft_spatial_knn_detail_fusedL2Knn(long, float, true);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(long, float, false);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(int, float, true);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(int, float, false);
+
+// These are used by brute_force_knn:
+instantiate_raft_spatial_knn_detail_fusedL2Knn(unsigned int, float, true);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(unsigned int, float, false);
+
+#undef instantiate_raft_spatial_knn_detail_fusedL2Knn
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
index e69de29bb2..7b16fc6f72 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#if defined(RAFT_COMPILED)
+#include "fused_l2_knn-ext.cuh"
+#endif
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "fused_l2_knn-inl.cuh"
+#endif
diff --git a/cpp/include/raft/util/cudart_utils.hpp b/cpp/include/raft/util/cudart_utils.hpp
index 1134513587..f3b083ac4a 100644
--- a/cpp/include/raft/util/cudart_utils.hpp
+++ b/cpp/include/raft/util/cudart_utils.hpp
@@ -18,10 +18,9 @@
 
 #include <raft/core/error.hpp>
 #include <raft/util/cuda_rt_essentials.hpp>
+#include <raft/util/memory_pool.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/managed_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/mr/device/pool_memory_resource.hpp>
 
 #include <cuda_fp16.h>
 #include <cuda_runtime_api.h>
@@ -451,51 +450,4 @@ constexpr inline auto upper_bound<half>() -> half
   return static_cast<half>(__half_constexpr{0x7c00u});
 }
 
-/**
- * @brief Get a pointer to a pooled memory resource within the scope of the lifetime of the returned
- * unique pointer.
- *
- * This function is useful in the code where multiple repeated allocations/deallocations are
- * expected.
- * Use case example:
- * @code{.cpp}
- *   void my_func(..., size_t n, rmm::mr::device_memory_resource* mr = nullptr) {
- *     auto pool_guard = raft::get_pool_memory_resource(mr, 2 * n * sizeof(float));
- *     if (pool_guard){
- *       RAFT_LOG_INFO("Created a pool %zu bytes", pool_guard->pool_size());
- *     } else {
- *       RAFT_LOG_INFO("Using the current default or explicitly passed device memory resource");
- *     }
- *     rmm::device_uvector<float> x(n, stream, mr);
- *     rmm::device_uvector<float> y(n, stream, mr);
- *     ...
- *   }
- * @endcode
- * Here, the new memory resource would be created within the function scope if the passed `mr` is
- * null and the default resource is not a pool. After the call, `mr` contains a valid memory
- * resource in any case.
- *
- * @param[inout] mr if not null do nothing; otherwise get the current device resource and wrap it
- * into a `pool_memory_resource` if necessary and return the pointer to the result.
- * @param initial_size if a new memory pool is created, this would be its initial size (rounded up
- * to 256 bytes).
- *
- * @return if a new memory pool is created, it returns a unique_ptr to it;
- *   this managed pointer controls the lifetime of the created memory resource.
- */
-inline auto get_pool_memory_resource(rmm::mr::device_memory_resource*& mr, size_t initial_size)
-{
-  using pool_res_t = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
-  std::unique_ptr<pool_res_t> pool_res{};
-  if (mr) return pool_res;
-  mr = rmm::mr::get_current_device_resource();
-  if (!dynamic_cast<pool_res_t*>(mr) &&
-      !dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>*>(mr) &&
-      !dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::managed_memory_resource>*>(mr)) {
-    pool_res = std::make_unique<pool_res_t>(mr, (initial_size + 255) & (~255));
-    mr       = pool_res.get();
-  }
-  return pool_res;
-}
-
 }  // namespace raft
diff --git a/cpp/include/raft/util/memory_pool-ext.hpp b/cpp/include/raft/util/memory_pool-ext.hpp
new file mode 100644
index 0000000000..fb48bc70c4
--- /dev/null
+++ b/cpp/include/raft/util/memory_pool-ext.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <cstddef>
+#include <memory>                                    // std::unique_ptr
+#include <rmm/mr/device/device_memory_resource.hpp>  // rmm::mr::device_memory_resource
+
+namespace raft {
+
+/**
+ * @brief Get a pointer to a pooled memory resource within the scope of the lifetime of the returned
+ * unique pointer.
+ *
+ * This function is useful in code where multiple repeated allocations/deallocations are
+ * expected.
+ * Use case example:
+ * @code{.cpp}
+ *   void my_func(..., size_t n, rmm::mr::device_memory_resource* mr = nullptr) {
+ *     auto pool_guard = raft::get_pool_memory_resource(mr, 2 * n * sizeof(float));
+ *     if (pool_guard){
+ *       RAFT_LOG_INFO("Created a new memory pool");
+ *     } else {
+ *       RAFT_LOG_INFO("Using the current default or explicitly passed device memory resource");
+ *     }
+ *     rmm::device_uvector<float> x(n, stream, mr);
+ *     rmm::device_uvector<float> y(n, stream, mr);
+ *     ...
+ *   }
+ * @endcode
+ * Here, the new memory resource would be created within the function scope if the passed `mr` is
+ * null and the default resource is not a pool. After the call, `mr` contains a valid memory
+ * resource in any case.
+ *
+ * @param[inout] mr if not null do nothing; otherwise get the current device resource and wrap it
+ * into a `pool_memory_resource` if necessary and return the pointer to the result.
+ * @param initial_size if a new memory pool is created, this would be its initial size (rounded up
+ * to a multiple of 256 bytes).
+ *
+ * @return if a new memory pool is created, a unique_ptr (to the base `device_memory_resource`
+ *   type) that controls the lifetime of the created pool; otherwise an empty unique_ptr.
+ */
+std::unique_ptr<rmm::mr::device_memory_resource> get_pool_memory_resource(
+  rmm::mr::device_memory_resource*& mr, size_t initial_size);
+
+}  // namespace raft
diff --git a/cpp/include/raft/util/memory_pool-inl.hpp b/cpp/include/raft/util/memory_pool-inl.hpp
new file mode 100644
index 0000000000..43ac24d3f6
--- /dev/null
+++ b/cpp/include/raft/util/memory_pool-inl.hpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <cstddef>
+#include <memory>
+
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+namespace raft {
+
+std::unique_ptr<rmm::mr::device_memory_resource> get_pool_memory_resource(
+  rmm::mr::device_memory_resource*& mr, size_t initial_size)
+{  // NOTE(review): non-inline definition in a header; confirm it is only included by one TU.
+  using pool_res_t = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
+  std::unique_ptr<pool_res_t> pool_res{};  // stays empty unless a new pool is created below
+  if (mr) return pool_res;                 // caller passed a resource: leave it untouched
+  mr = rmm::mr::get_current_device_resource();
+  if (!dynamic_cast<pool_res_t*>(mr) &&  // wrap only when `mr` is not already one of these pools
+      !dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>*>(mr) &&
+      !dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::managed_memory_resource>*>(mr)) {
+    pool_res = std::make_unique<pool_res_t>(mr, (initial_size + 255) & (~255));
+    mr       = pool_res.get();  // hand the new pool back through the in/out parameter
+  }
+  return pool_res;  // converts to unique_ptr<device_memory_resource>; owns the pool, if any
+}
+
+}  // namespace raft
diff --git a/cpp/include/raft/util/memory_pool.hpp b/cpp/include/raft/util/memory_pool.hpp
index e69de29bb2..3bf0beb773 100644
--- a/cpp/include/raft/util/memory_pool.hpp
+++ b/cpp/include/raft/util/memory_pool.hpp
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef RAFT_COMPILED
+#include "memory_pool-ext.hpp"
+#else
+#include "memory_pool-inl.hpp"
+#endif  // RAFT_COMPILED
diff --git a/cpp/src/linalg/detail/coalesced_reduction.cu b/cpp/src/linalg/detail/coalesced_reduction.cu
new file mode 100644
index 0000000000..00d025df46
--- /dev/null
+++ b/cpp/src/linalg/detail/coalesced_reduction.cu
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// #include <raft/linalg/detail/coalesced_reduction-ext.cuh>
+
+#include <raft/linalg/detail/coalesced_reduction-inl.cuh>
+
+#define instantiate_raft_linalg_detail_coalescedReduction(                       \
+  InType, OutType, IdxType, MainLambda, ReduceLambda, FinalLambda)               \
+  template void raft::linalg::detail::coalescedReduction(OutType* dots,          \
+                                                         const InType* data,     \
+                                                         IdxType D,              \
+                                                         IdxType N,              \
+                                                         OutType init,           \
+                                                         cudaStream_t stream,    \
+                                                         bool inplace,           \
+                                                         MainLambda main_op,     \
+                                                         ReduceLambda reduce_op, \
+                                                         FinalLambda final_op)
+
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::identity_op, raft::min_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::sq_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::sq_op, raft::add_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::abs_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  double, double, int, raft::abs_op, raft::max_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::abs_op, raft::add_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::abs_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::identity_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::identity_op, raft::min_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::sq_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, int, raft::sq_op, raft::add_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, long, raft::sq_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::identity_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::sq_op, raft::add_op, raft::identity_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::abs_op, raft::max_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, size_t, raft::sq_op, raft::add_op, raft::sqrt_op);
+instantiate_raft_linalg_detail_coalescedReduction(
+  float, float, unsigned int, raft::sq_op, raft::add_op, raft::identity_op);
+
+#undef instantiate_raft_linalg_detail_coalescedReduction
diff --git a/cpp/src/neighbors/detail/selection_faiss.cu b/cpp/src/neighbors/detail/selection_faiss.cu
new file mode 100644
index 0000000000..067ac5fdda
--- /dev/null
+++ b/cpp/src/neighbors/detail/selection_faiss.cu
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/detail/selection_faiss-inl.cuh>
+
+#define instantiate_raft_neighbors_detail_select_k(payload_t, key_t)    \
+  template void raft::neighbors::detail::select_k(const key_t* inK,     \
+                                                  const payload_t* inV, \
+                                                  size_t n_rows,        \
+                                                  size_t n_cols,        \
+                                                  key_t* outK,          \
+                                                  payload_t* outV,      \
+                                                  bool select_min,      \
+                                                  int k,                \
+                                                  cudaStream_t stream)
+
+// NOTE(review, @benfred): confirm the argument order. The macro takes (payload_t, key_t), so
+// these instantiate select_k with float keys (inK/outK) and uint32_t / long payloads (inV/outV).
+instantiate_raft_neighbors_detail_select_k(uint32_t, float);
+instantiate_raft_neighbors_detail_select_k(long, float);
+
+#undef instantiate_raft_neighbors_detail_select_k
diff --git a/cpp/src/util/memory_pool.cpp b/cpp/src/util/memory_pool.cpp
new file mode 100644
index 0000000000..837e870043
--- /dev/null
+++ b/cpp/src/util/memory_pool.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/util/memory_pool-inl.hpp>
diff --git a/cpp/test/util/device_atomics.cu b/cpp/test/util/device_atomics.cu
index 5e8a67c8f6..0b22073d1b 100644
--- a/cpp/test/util/device_atomics.cu
+++ b/cpp/test/util/device_atomics.cu
@@ -25,7 +25,6 @@
 #include <raft/util/device_atomics.cuh>
 #include <rmm/cuda_stream_pool.hpp>
 #include <rmm/device_scalar.hpp>
-#include <rmm/device_uvector.hpp>
 
 namespace raft {
 

From 2aebe4bc35280a9f29aaa90ae8be6c86d7dc1804 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 6 Apr 2023 14:54:06 +0200
Subject: [PATCH 09/89] Split ivf_flat_search and interleaved_scan

Also reduce specialization on veclen from 1, 2, .. 16/sizeof(T) to 1,
16/sizeof(T).
---
 cpp/CMakeLists.txt                            |   22 +-
 cpp/include/raft/core/mdarray.hpp             |    1 +
 .../raft/matrix/detail/select_k-ext.cuh       |   98 ++
 cpp/include/raft/matrix/detail/select_k.cuh   |   25 +
 .../raft/matrix/detail/select_warpsort.cuh    |    2 +-
 .../detail/ivf_flat_interleaved_scan-ext.cuh  |   93 ++
 .../detail/ivf_flat_interleaved_scan-inl.cuh  | 1084 +++++++++++++++++
 .../detail/ivf_flat_interleaved_scan.cuh      |   25 +
 .../neighbors/detail/ivf_flat_search-ext.cuh  |   61 +
 .../neighbors/detail/ivf_flat_search-inl.cuh  | 1083 +---------------
 .../raft/neighbors/detail/ivf_flat_search.cuh |   25 +
 cpp/include/raft/neighbors/detail/refine.cuh  |    2 +
 .../raft/neighbors/detail/selection_faiss.cuh |    8 +-
 cpp/include/raft/neighbors/ivf_flat_types.hpp |    3 +-
 .../neighbors/specializations/ivf_flat.cuh    |   25 +-
 .../raft/spatial/knn/detail/ann_utils.cuh     |    1 -
 ...at_interleaved_scan_float_float_int64_t.cu |   36 +
 ...interleaved_scan_int8_t_int32_t_int64_t.cu |   36 +
 ...terleaved_scan_uint8_t_uint32_t_int64_t.cu |   36 +
 cpp/src/neighbors/detail/ivf_flat_search.cu   |   35 +
 20 files changed, 1591 insertions(+), 1110 deletions(-)
 create mode 100644 cpp/include/raft/matrix/detail/select_k-ext.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
 create mode 100644 cpp/src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu
 create mode 100644 cpp/src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/detail/ivf_flat_search.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f39638946c..8aa71647c2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -347,10 +347,14 @@ if(RAFT_COMPILE_LIBRARY)
     src/distance/specializations/fused_l2_nn_double_int64.cu
     src/distance/specializations/fused_l2_nn_float_int.cu
     src/distance/specializations/fused_l2_nn_float_int64.cu
-    src/matrix/specializations/detail/select_k_float_uint32_t.cu
-    src/matrix/specializations/detail/select_k_float_int64_t.cu
-    src/matrix/specializations/detail/select_k_half_uint32_t.cu
-    src/matrix/specializations/detail/select_k_half_int64_t.cu
+    src/matrix/detail/select_k_float_uint32_t.cu
+    src/matrix/detail/select_k_float_uint64_t.cu
+    src/matrix/detail/select_k_half_uint32_t.cu
+    src/matrix/detail/select_k_half_uint64_t.cu
+    # src/matrix/specializations/detail/select_k_float_uint32_t.cu
+    # src/matrix/specializations/detail/select_k_float_int64_t.cu
+    # src/matrix/specializations/detail/select_k_half_uint32_t.cu
+    # src/matrix/specializations/detail/select_k_half_int64_t.cu
     src/neighbors/ivfpq_build.cu
     src/neighbors/ivfpq_deserialize.cu
     src/neighbors/ivfpq_serialize.cu
@@ -395,6 +399,10 @@ if(RAFT_COMPILE_LIBRARY)
     src/random/rmat_rectangular_generator_int64_double.cu
     src/random/rmat_rectangular_generator_int_float.cu
     src/random/rmat_rectangular_generator_int64_float.cu
+    src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu
+    src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu
+    src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu
+    src/neighbors/detail/ivf_flat_search.cu
     # src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
     # src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
     # src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
@@ -414,9 +422,9 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/specializations/ivfflat_extend_float_int64_t.cu
     src/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu
     src/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu
-    src/neighbors/specializations/ivfflat_search_float_int64_t.cu
-    src/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu
-    src/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu
+    # src/neighbors/specializations/ivfflat_search_float_int64_t.cu
+    # src/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu
+    # src/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu
     src/neighbors/ivfpq_build.cu
     src/neighbors/ivfpq_deserialize.cu
     src/neighbors/ivfpq_serialize.cu
diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp
index 61c1b500e6..35a5758890 100644
--- a/cpp/include/raft/core/mdarray.hpp
+++ b/cpp/include/raft/core/mdarray.hpp
@@ -25,6 +25,7 @@
 #include <stddef.h>
 
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/device_resources.hpp>
 #include <raft/core/host_device_accessor.hpp>
 #include <raft/core/mdspan.hpp>
 #include <raft/core/mdspan_types.hpp>
diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh
new file mode 100644
index 0000000000..074ac3127a
--- /dev/null
+++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <cuda_fp16.h>
+#include <raft/util/raft_explicit.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE
+
+namespace raft::matrix::detail {
+
+/**
+ * Select k smallest or largest key/values from each row in the input data.
+ *
+ * If you think of the input data `in_val` as a row-major matrix with `len` columns and
+ * `batch_size` rows, then this function selects `k` smallest/largest values in each row and fills
+ * in the row-major matrix `out_val` of size (batch_size, k).
+ *
+ * @tparam T
+ *   the type of the keys (what is being compared).
+ * @tparam IdxT
+ *   the index type (what is being selected together with the keys).
+ *
+ * @param[in] in_val
+ *   contiguous device array of inputs of size (len * batch_size);
+ *   these are compared and selected.
+ * @param[in] in_idx
+ *   contiguous device array of inputs of size (len * batch_size);
+ *   typically, these are indices of the corresponding in_val.
+ * @param batch_size
+ *   number of input rows, i.e. the batch size.
+ * @param len
+ *   length of a single input array (row); also sometimes referred as n_cols.
+ *   Invariant: len >= k.
+ * @param k
+ *   the number of outputs to select in each input row.
+ * @param[out] out_val
+ *   contiguous device array of outputs of size (k * batch_size);
+ *   the k smallest/largest values from each row of the `in_val`.
+ * @param[out] out_idx
+ *   contiguous device array of outputs of size (k * batch_size);
+ *   the payload selected together with `out_val`.
+ * @param select_min
+ *   whether to select k smallest (true) or largest (false) keys.
+ * @param stream
+ * @param mr an optional memory resource to use across the calls (you can provide a large enough
+ *           memory pool here to avoid memory allocations within the call).
+ */
+template <typename T, typename IdxT>
+void select_k(const T* in_val,
+              const IdxT* in_idx,
+              size_t batch_size,
+              size_t len,
+              int k,
+              T* out_val,
+              IdxT* out_idx,
+              bool select_min,
+              rmm::cuda_stream_view stream,
+              rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT;
+}  // namespace raft::matrix::detail
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                            \
+  extern template void raft::matrix::detail::select_k(const T* in_val,              \
+                                                      const IdxT* in_idx,           \
+                                                      size_t batch_size,            \
+                                                      size_t len,                   \
+                                                      int k,                        \
+                                                      T* out_val,                   \
+                                                      IdxT* out_idx,                \
+                                                      bool select_min,              \
+                                                      rmm::cuda_stream_view stream, \
+                                                      rmm::mr::device_memory_resource* mr)
+
+instantiate_raft_matrix_detail_select_k(__half, uint32_t);
+instantiate_raft_matrix_detail_select_k(__half, int64_t);
+instantiate_raft_matrix_detail_select_k(float, int64_t);
+instantiate_raft_matrix_detail_select_k(float, uint32_t);
+
+#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/include/raft/matrix/detail/select_k.cuh b/cpp/include/raft/matrix/detail/select_k.cuh
index e69de29bb2..54dabf77bd 100644
--- a/cpp/include/raft/matrix/detail/select_k.cuh
+++ b/cpp/include/raft/matrix/detail/select_k.cuh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "select_k-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "select_k-ext.cuh"
+#endif
diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh
index d362b73792..5f3d0e6bc7 100644
--- a/cpp/include/raft/matrix/detail/select_warpsort.cuh
+++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh
@@ -27,7 +27,7 @@
 #include <functional>
 #include <type_traits>
 
-#include <rmm/device_vector.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
 /*
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh
new file mode 100644
index 0000000000..a0eaea0260
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>                                // uintX_t
+#include <raft/neighbors/ivf_flat_types.hpp>      // index
+#include <raft/spatial/knn/detail/ann_utils.cuh>  // TODO: consider removing
+#include <raft/util/raft_explicit.hpp>            // RAFT_EXPLICIT
+#include <rmm/cuda_stream_view.hpp>               // rmm::cuda_stream_view
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE
+
+namespace raft::neighbors::ivf_flat::detail {
+
+using namespace raft::spatial::knn::detail;  // NOLINT
+
+/**
+ * @brief Configure and launch an appropriate template instance of the interleaved scan kernel.
+ *
+ * @tparam T value type
+ * @tparam AccT accumulated type
+ * @tparam IdxT type of the indices
+ *
+ * @param index previously built ivf-flat index
+ * @param[in] queries device pointer to the query vectors [batch_size, dim]
+ * @param[in] coarse_query_results device pointer to the cluster (list) ids [batch_size, n_probes]
+ * @param n_queries batch size
+ * @param metric type of the measured distance
+ * @param n_probes number of nearest clusters to query
+ * @param k number of nearest neighbors.
+ *            NB: the maximum value of `k` is limited statically by `kMaxCapacity`.
+ * @param select_min whether to select nearest (true) or furthest (false) points w.r.t. the given
+ * metric.
+ * @param[out] neighbors device pointer to the result indices for each query and cluster
+ * [batch_size, grid_dim_x, k]
+ * @param[out] distances device pointer to the result distances for each query and cluster
+ * [batch_size, grid_dim_x, k]
+ * @param[inout] grid_dim_x number of blocks launched across all n_probes clusters;
+ *               (one block processes one or more probes, hence: 1 <= grid_dim_x <= n_probes)
+ * @param stream
+ */
+template <typename T, typename AccT, typename IdxT>
+void ivfflat_interleaved_scan(const raft::neighbors::ivf_flat::index<T, IdxT>& index,
+                              const T* queries,
+                              const uint32_t* coarse_query_results,
+                              const uint32_t n_queries,
+                              const raft::distance::DistanceType metric,
+                              const uint32_t n_probes,
+                              const uint32_t k,
+                              const bool select_min,
+                              IdxT* neighbors,
+                              float* distances,
+                              uint32_t& grid_dim_x,
+                              rmm::cuda_stream_view stream) RAFT_EXPLICIT;
+
+}  // namespace raft::neighbors::ivf_flat::detail
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+
+#define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(T, AccT, IdxT)         \
+  extern template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan<T, AccT, IdxT>( \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,                                        \
+    const T* queries,                                                                              \
+    const uint32_t* coarse_query_results,                                                          \
+    const uint32_t n_queries,                                                                      \
+    const raft::distance::DistanceType metric,                                                     \
+    const uint32_t n_probes,                                                                       \
+    const uint32_t k,                                                                              \
+    const bool select_min,                                                                         \
+    IdxT* neighbors,                                                                               \
+    float* distances,                                                                              \
+    uint32_t& grid_dim_x,                                                                          \
+    rmm::cuda_stream_view stream)
+
+instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(float, float, int64_t);
+instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(int8_t, int32_t, int64_t);
+instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(uint8_t, uint32_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh
new file mode 100644
index 0000000000..4848022e30
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh
@@ -0,0 +1,1084 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/logger.hpp>  // RAFT_LOG_TRACE
+#include <raft/core/operators.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/matrix/detail/select_warpsort.cuh>
+#include <raft/neighbors/ivf_flat_types.hpp>
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+#include <raft/util/cuda_rt_essentials.hpp>  // RAFT_CUDA_TRY
+#include <raft/util/device_loads_stores.cuh>
+#include <raft/util/integer_utils.hpp>
+#include <raft/util/pow2_utils.cuh>
+#include <raft/util/vectorized.cuh>
+#include <rmm/cuda_stream_view.hpp>
+
+namespace raft::neighbors::ivf_flat::detail {
+
+using namespace raft::spatial::knn::detail;  // NOLINT
+// Thread-block size: used as the __launch_bounds__ of the scan kernel and in the occupancy query.
+constexpr int kThreadsPerBlock = 128;
+
+/**
+ * @brief Copy `n` elements per block from one place to another, vectorizing where alignment
+ * allows (up to `VecBytes` bytes per access); threads stride over the range by `blockDim.x`.
+ * @param[out] out target pointer (unique per block)
+ * @param[in] in source pointer
+ * @param n number of elements to copy
+ */
+template <int VecBytes = 16, typename T>
+__device__ inline void copy_vectorized(T* out, const T* in, uint32_t n)
+{
+  constexpr int VecElems = VecBytes / sizeof(T);  // NOLINT
+  using align_bytes      = Pow2<(size_t)VecBytes>;
+  if constexpr (VecElems > 1) {
+    using align_elems = Pow2<VecElems>;
+    if (!align_bytes::areSameAlignOffsets(out, in)) {  // offsets differ: halve the vector width
+      return copy_vectorized<(VecBytes >> 1), T>(out, in, n);
+    }
+    {  // process unaligned head
+      uint32_t head = align_bytes::roundUp(in) - in;
+      if (head > n) { head = n; }  // never copy (or subtract) more than was requested
+      if (head > 0) {
+        copy_vectorized<sizeof(T), T>(out, in, head);
+        n -= head;
+        in += head;
+        out += head;
+      }
+    }
+    {  // process main part vectorized
+      using vec_t = typename IOType<T, VecElems>::Type;
+      copy_vectorized<sizeof(vec_t), vec_t>(
+        reinterpret_cast<vec_t*>(out), reinterpret_cast<const vec_t*>(in), align_elems::div(n));
+    }
+    {  // process unaligned tail
+      uint32_t tail = align_elems::mod(n);
+      if (tail > 0) {
+        n -= tail;
+        copy_vectorized<sizeof(T), T>(out + n, in + n, tail);
+      }
+    }
+  } else {  // scalar fallback: one element per thread per iteration
+    for (uint32_t i = threadIdx.x; i < n; i += blockDim.x) {
+      out[i] = in[i];
+    }
+  }
+}
+
+/**
+ * @brief Per-thread helper that loads a slice of an index vector together with the matching slice
+ * of the query and folds their element-wise contributions into a running distance via `Lambda`;
+ * one structure per thread, per query, and per index item.
+ *
+ * @tparam kUnroll elements per loop (normally, kUnroll = WarpSize / Veclen)
+ * @tparam Lambda functor evaluating the contribution of one dimension and accumulating it:
+ *                void (AccT& acc, AccT x, AccT y)
+ * @tparam Veclen size of the vectorized load
+ * @tparam T type of the data in the query and the index
+ * @tparam AccT type of the accumulated value (an optimization for 8bit values to be loaded as 32bit
+ * values)
+ */
+template <int kUnroll, typename Lambda, int Veclen, typename T, typename AccT>
+struct loadAndComputeDist {
+  Lambda compute_dist;
+  AccT& dist;
+
+  __device__ __forceinline__ loadAndComputeDist(AccT& dist, Lambda op)
+    : compute_dist(op), dist(dist)
+  {
+  }
+
+  /**
+   * Accumulate the partial distance over `kUnroll * Veclen` dimensions, reading the index
+   * through `ldg` and the query slice from shared memory through `lds`.
+   * Each thread works independently of the others here.
+   */
+  template <typename IdxT>
+  __device__ __forceinline__ void runLoadShmemCompute(const T* const& data,
+                                                      const T* query_shared,
+                                                      IdxT loadIndex,
+                                                      IdxT shmemIndex)
+  {
+#pragma unroll
+    for (int u = 0; u < kUnroll; ++u) {
+      T enc_vals[Veclen];
+      ldg(enc_vals, data + (loadIndex + u * kIndexGroupSize) * Veclen);
+      T qry_vals[Veclen];
+      lds(qry_vals, query_shared + (shmemIndex + u * Veclen));
+#pragma unroll
+      for (int e = 0; e < Veclen; ++e) {
+        compute_dist(dist, qry_vals[e], enc_vals[e]);
+      }
+    }
+  }
+
+  /**
+   * Accumulate the partial distance with the query read from global memory.
+   * Every lane fetches one query element (WarpSize elements per warp) and the values are then
+   * exchanged between lanes with `shfl`, `kUnroll * Veclen` of them per outer iteration;
+   * `data` is advanced past the consumed interleaved chunks as a side effect.
+   */
+  template <typename IdxT>
+  __device__ __forceinline__ void runLoadShflAndCompute(const T*& data,
+                                                        const T* query,
+                                                        IdxT baseLoadIndex,
+                                                        const int lane_id)
+  {
+    T lane_qry                 = query[baseLoadIndex + lane_id];
+    constexpr int kDimsPerIter = kUnroll * Veclen;
+    constexpr int kIters       = WarpSize / kDimsPerIter;
+    constexpr int kDataStep    = kDimsPerIter * kIndexGroupSize;
+#pragma unroll
+    for (int it = 0; it < kIters; ++it, data += kDataStep) {
+#pragma unroll
+      for (int u = 0; u < kUnroll; ++u) {
+        T enc_vals[Veclen];
+        ldg(enc_vals, data + (lane_id + u * kIndexGroupSize) * Veclen);
+        const int src_lane = (it * kUnroll + u) * Veclen;
+#pragma unroll
+        for (int e = 0; e < Veclen; ++e) {
+          compute_dist(dist, shfl(lane_qry, src_lane + e, WarpSize), enc_vals[e]);
+        }
+      }
+    }
+  }
+
+  /**
+   * Tail of `runLoadShflAndCompute`: handles the `dim - dimBlocks` trailing dimensions, i.e. the
+   * part left over when `dim` is not a multiple of `WarpSize`.
+   */
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
+    const T*& data, const T* query, const int lane_id, const int dim, const int dimBlocks)
+  {
+    const int qry_pos   = dimBlocks + lane_id;
+    T lane_qry          = qry_pos < dim ? query[qry_pos] : 0;
+    const int remaining = dim - dimBlocks;
+    for (int d = 0; d < remaining; d += Veclen, data += kIndexGroupSize * Veclen) {
+      T enc_vals[Veclen];
+      ldg(enc_vals, data + lane_id * Veclen);
+#pragma unroll
+      for (int e = 0; e < Veclen; e++) {
+        compute_dist(dist, shfl(lane_qry, d + e, WarpSize), enc_vals[e]);
+      }
+    }
+  }
+};
+
+// uint8_t specialization for Veclen 8 and 16: data is moved and consumed as 32-bit words
+template <int kUnroll, typename Lambda, int uint8_veclen>
+struct loadAndComputeDist<kUnroll, Lambda, uint8_veclen, uint8_t, uint32_t> {
+  Lambda compute_dist;
+  uint32_t& dist;
+
+  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
+    : compute_dist(op), dist(dist)
+  {
+  }
+
+  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
+                                                      const uint8_t* query_shared,
+                                                      int loadIndex,
+                                                      int shmemIndex)
+  {
+    constexpr int kWords = uint8_veclen / 4;  // 32-bit words per vectorized load
+    const int word_base  = loadIndex * kWords;
+#pragma unroll
+    for (int u = 0; u < kUnroll; ++u) {
+      uint32_t enc_words[kWords];
+      ldg(enc_words,
+          reinterpret_cast<unsigned const*>(data) + word_base + u * kIndexGroupSize * kWords);
+      uint32_t qry_words[kWords];
+      lds(qry_words, reinterpret_cast<unsigned const*>(query_shared + shmemIndex) + u * kWords);
+#pragma unroll
+      for (int w = 0; w < kWords; w++) {
+        compute_dist(dist, qry_words[w], enc_words[w]);
+      }
+    }
+  }
+  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
+                                                        const uint8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int lane_id)
+  {
+    constexpr int kWords = uint8_veclen / 4;  // 32-bit words per vectorized load
+    uint32_t lane_qry =
+      (lane_id < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[lane_id] : 0;
+    constexpr int kStride = kUnroll * uint8_veclen;
+    // Only lanes 0..7 hold query data: 8 words of 4 bytes each are fetched per call.
+#pragma unroll
+    for (int it = 0; it < WarpSize / kStride; ++it, data += kStride * kIndexGroupSize) {
+#pragma unroll
+      for (int u = 0; u < kUnroll; ++u) {
+        uint32_t enc_words[kWords];
+        ldg(enc_words,
+            reinterpret_cast<unsigned const*>(data) + (lane_id + u * kIndexGroupSize) * kWords);
+        const int src_lane = (it * kUnroll + u) * kWords;
+#pragma unroll
+        for (int w = 0; w < kWords; ++w) {
+          compute_dist(dist, shfl(lane_qry, src_lane + w, WarpSize), enc_words[w]);
+        }
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
+                                                                 const uint8_t* query,
+                                                                 const int lane_id,
+                                                                 const int dim,
+                                                                 const int dimBlocks)
+  {
+    constexpr int kWords = uint8_veclen / 4;
+    const int qry_pos    = dimBlocks + lane_id * 4;  // each lane owns one 4-byte word
+    uint32_t lane_qry = qry_pos < dim ? reinterpret_cast<uint32_t const*>(query + qry_pos)[0] : 0;
+    for (int d = 0; d < dim - dimBlocks;
+         d += uint8_veclen, data += kIndexGroupSize * uint8_veclen) {
+      uint32_t enc_words[kWords];
+      ldg(enc_words, reinterpret_cast<uint32_t const*>(data) + lane_id * kWords);
+#pragma unroll
+      for (int w = 0; w < kWords; w++) {
+        uint32_t qry = shfl(lane_qry, (d / 4) + w, WarpSize);
+        compute_dist(dist, qry, enc_words[w]);
+      }
+    }
+  }
+};
+
+// uint8_t specialization for Veclen = 4 (one 32-bit word per access); kept separate because the
+// compiler generates suboptimal code when the common int2/int4 template above is used for it
+template <int kUnroll, typename Lambda>
+struct loadAndComputeDist<kUnroll, Lambda, 4, uint8_t, uint32_t> {
+  Lambda compute_dist;
+  uint32_t& dist;
+
+  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
+    : compute_dist(op), dist(dist)
+  {
+  }
+  // Shared-memory path: `kUnroll` index words are paired with `kUnroll` query words.
+  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
+                                                      const uint8_t* query_shared,
+                                                      int loadIndex,
+                                                      int shmemIndex)
+  {
+#pragma unroll
+    for (int j = 0; j < kUnroll; ++j) {
+      uint32_t encV      = reinterpret_cast<unsigned const*>(data)[loadIndex + j * kIndexGroupSize];
+      uint32_t queryRegs = reinterpret_cast<unsigned const*>(query_shared + shmemIndex)[j];
+      compute_dist(dist, queryRegs, encV);
+    }
+  }
+  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
+                                                        const uint8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int lane_id)
+  {
+    uint32_t queryReg =
+      (lane_id < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[lane_id] : 0;
+    constexpr int veclen = 4;
+    constexpr int stride = kUnroll * veclen;
+    // Lanes 0..7 each hold one 4-byte query word; lane `i * kUnroll + j` is the shfl source.
+#pragma unroll
+    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
+#pragma unroll
+      for (int j = 0; j < kUnroll; ++j) {
+        uint32_t encV = reinterpret_cast<unsigned const*>(data)[lane_id + j * kIndexGroupSize];
+        uint32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
+        compute_dist(dist, q, encV);
+      }
+    }
+  }
+  // Remainder path: lane `l` loads the query word at element offset `dimBlocks + 4 * l`.
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
+                                                                 const uint8_t* query,
+                                                                 const int lane_id,
+                                                                 const int dim,
+                                                                 const int dimBlocks)
+  {
+    constexpr int veclen = 4;
+    const int loadDim    = dimBlocks + lane_id * veclen;
+    uint32_t queryReg = loadDim < dim ? reinterpret_cast<unsigned const*>(query + loadDim)[0] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
+      uint32_t enc = reinterpret_cast<unsigned const*>(data)[lane_id];
+      uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
+      compute_dist(dist, q, enc);
+    }
+  }
+};
+
+template <int kUnroll, typename Lambda>
+struct loadAndComputeDist<kUnroll, Lambda, 2, uint8_t, uint32_t> {
+  Lambda compute_dist;
+  uint32_t& dist;
+
+  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
+    : compute_dist(op), dist(dist)
+  {
+  }
+  // uint8_t, Veclen = 2: every access reads one 16-bit pair, widened to uint32_t for the lambda.
+  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
+                                                      const uint8_t* query_shared,
+                                                      int loadIndex,
+                                                      int shmemIndex)
+  {
+#pragma unroll
+    for (int u = 0; u < kUnroll; ++u) {
+      uint32_t qry_pair = reinterpret_cast<uint16_t const*>(query_shared + shmemIndex)[u];
+      uint32_t enc_pair = reinterpret_cast<uint16_t const*>(data)[loadIndex + u * kIndexGroupSize];
+      compute_dist(dist, qry_pair, enc_pair);
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
+                                                        const uint8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int lane_id)
+  {
+    uint32_t lane_qry =
+      (lane_id < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[lane_id] : 0;
+    constexpr int kVeclen = 2;
+    constexpr int kStride = kUnroll * kVeclen;
+
+#pragma unroll
+    for (int it = 0; it < WarpSize / kStride; ++it, data += kStride * kIndexGroupSize) {
+#pragma unroll
+      for (int u = 0; u < kUnroll; ++u) {
+        uint32_t enc_pair = reinterpret_cast<uint16_t const*>(data)[lane_id + u * kIndexGroupSize];
+        uint32_t qry_pair = shfl(lane_qry, it * kUnroll + u, WarpSize);
+        compute_dist(dist, qry_pair, enc_pair);
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
+                                                                 const uint8_t* query,
+                                                                 const int lane_id,
+                                                                 const int dim,
+                                                                 const int dimBlocks)
+  {
+    constexpr int kVeclen = 2;
+    int qry_pos           = dimBlocks + lane_id * kVeclen;
+    uint32_t lane_qry = qry_pos < dim ? reinterpret_cast<uint16_t const*>(query + qry_pos)[0] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += kVeclen, data += kIndexGroupSize * kVeclen) {
+      uint32_t enc_pair = reinterpret_cast<uint16_t const*>(data)[lane_id];
+      uint32_t qry_pair = shfl(lane_qry, d / kVeclen, WarpSize);
+      compute_dist(dist, qry_pair, enc_pair);
+    }
+  }
+};
+
+template <int kUnroll, typename Lambda>
+struct loadAndComputeDist<kUnroll, Lambda, 1, uint8_t, uint32_t> {
+  Lambda compute_dist;
+  uint32_t& dist;
+
+  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
+    : compute_dist(op), dist(dist)
+  {
+  }
+  // uint8_t, Veclen = 1: single bytes are loaded and widened to uint32_t before `compute_dist`.
+  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
+                                                      const uint8_t* query_shared,
+                                                      int loadIndex,
+                                                      int shmemIndex)
+  {
+#pragma unroll
+    for (int u = 0; u < kUnroll; ++u) {
+      uint32_t qry_val = query_shared[shmemIndex + u];
+      uint32_t enc_val = data[loadIndex + u * kIndexGroupSize];
+      compute_dist(dist, qry_val, enc_val);
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
+                                                        const uint8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int lane_id)
+  {
+    uint32_t lane_qry     = query[baseLoadIndex + lane_id];
+    constexpr int kVeclen = 1;
+    constexpr int kStride = kUnroll * kVeclen;
+
+#pragma unroll
+    for (int it = 0; it < WarpSize / kStride; ++it, data += kStride * kIndexGroupSize) {
+#pragma unroll
+      for (int u = 0; u < kUnroll; ++u) {
+        uint32_t enc_val = data[lane_id + u * kIndexGroupSize];
+        uint32_t qry_val = shfl(lane_qry, it * kUnroll + u, WarpSize);
+        compute_dist(dist, qry_val, enc_val);
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
+                                                                 const uint8_t* query,
+                                                                 const int lane_id,
+                                                                 const int dim,
+                                                                 const int dimBlocks)
+  {
+    constexpr int kVeclen = 1;
+    int qry_pos           = dimBlocks + lane_id;
+    uint32_t lane_qry     = qry_pos < dim ? query[qry_pos] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += kVeclen, data += kIndexGroupSize * kVeclen) {
+      uint32_t enc_val = data[lane_id];
+      uint32_t qry_val = shfl(lane_qry, d, WarpSize);
+      compute_dist(dist, qry_val, enc_val);
+    }
+  }
+};
+
+// int8_t specialization for Veclen 4, 8 and 16: data is moved and consumed as 32-bit words
+template <int kUnroll, typename Lambda, int int8_veclen>
+struct loadAndComputeDist<kUnroll, Lambda, int8_veclen, int8_t, int32_t> {
+  Lambda compute_dist;
+  int32_t& dist;
+
+  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
+    : compute_dist(op), dist(dist)
+  {
+  }
+
+  __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
+                                                      const int8_t* query_shared,
+                                                      int loadIndex,
+                                                      int shmemIndex)
+  {
+    constexpr int kWords = int8_veclen / 4;  // 32-bit words per vectorized load
+
+#pragma unroll
+    for (int u = 0; u < kUnroll; ++u) {
+      int32_t enc_words[kWords];
+      ldg(enc_words,
+          reinterpret_cast<int32_t const*>(data) + (loadIndex + u * kIndexGroupSize) * kWords);
+      int32_t qry_words[kWords];
+      lds(qry_words, reinterpret_cast<int32_t const*>(query_shared + shmemIndex) + u * kWords);
+#pragma unroll
+      for (int w = 0; w < kWords; w++) {
+        compute_dist(dist, qry_words[w], enc_words[w]);
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
+                                                        const int8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int lane_id)
+  {
+    constexpr int kWords = int8_veclen / 4;  // 32-bit words per vectorized load
+    // Only lanes 0..7 hold query data: 8 words of 4 bytes each are fetched per call.
+    int32_t lane_qry =
+      (lane_id < 8) ? reinterpret_cast<int32_t const*>(query + baseLoadIndex)[lane_id] : 0;
+    constexpr int kStride = kUnroll * int8_veclen;
+
+#pragma unroll
+    for (int it = 0; it < WarpSize / kStride; ++it, data += kStride * kIndexGroupSize) {
+#pragma unroll
+      for (int u = 0; u < kUnroll; ++u) {
+        int32_t enc_words[kWords];
+        ldg(enc_words,
+            reinterpret_cast<int32_t const*>(data) + (lane_id + u * kIndexGroupSize) * kWords);
+        const int src_lane = (it * kUnroll + u) * kWords;
+#pragma unroll
+        for (int w = 0; w < kWords; ++w) {
+          int32_t qry = shfl(lane_qry, src_lane + w, WarpSize);
+          compute_dist(dist, qry, enc_words[w]);
+        }
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
+    const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks)
+  {
+    constexpr int kWords = int8_veclen / 4;
+    const int qry_pos    = dimBlocks + lane_id * 4;  // each lane owns one 4-byte word
+    int32_t lane_qry = qry_pos < dim ? reinterpret_cast<int32_t const*>(query + qry_pos)[0] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += int8_veclen, data += kIndexGroupSize * int8_veclen) {
+      int32_t enc_words[kWords];
+      ldg(enc_words, reinterpret_cast<int32_t const*>(data) + lane_id * kWords);
+#pragma unroll
+      for (int w = 0; w < kWords; w++) {
+        int32_t qry = shfl(lane_qry, (d / 4) + w, WarpSize);  // d / 4: bytes -> word index
+        compute_dist(dist, qry, enc_words[w]);
+      }
+    }
+  }
+};
+
+template <int kUnroll, typename Lambda>
+struct loadAndComputeDist<kUnroll, Lambda, 2, int8_t, int32_t> {
+  Lambda compute_dist;
+  int32_t& dist;
+  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
+    : compute_dist(op), dist(dist)
+  {
+  }
+  __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
+                                                      const int8_t* query_shared,
+                                                      int loadIndex,
+                                                      int shmemIndex)
+  {
+#pragma unroll
+    for (int u = 0; u < kUnroll; ++u) {
+      int32_t qry_pair = reinterpret_cast<uint16_t const*>(query_shared + shmemIndex)[u];
+      int32_t enc_pair = reinterpret_cast<uint16_t const*>(data)[loadIndex + u * kIndexGroupSize];
+      compute_dist(dist, qry_pair, enc_pair);
+    }
+  }
+  // int8_t, Veclen = 2: pairs of bytes travel as one uint16_t value stored into an int32_t.
+  __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
+                                                        const int8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int lane_id)
+  {
+    int32_t lane_qry =
+      (lane_id < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[lane_id] : 0;
+    constexpr int kVeclen = 2;
+    constexpr int kStride = kUnroll * kVeclen;
+
+#pragma unroll
+    for (int it = 0; it < WarpSize / kStride; ++it, data += kStride * kIndexGroupSize) {
+#pragma unroll
+      for (int u = 0; u < kUnroll; ++u) {
+        int32_t enc_pair = reinterpret_cast<uint16_t const*>(data)[lane_id + u * kIndexGroupSize];
+        int32_t qry_pair = shfl(lane_qry, it * kUnroll + u, WarpSize);
+        compute_dist(dist, qry_pair, enc_pair);
+      }
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
+    const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks)
+  {
+    constexpr int kVeclen = 2;
+    int qry_pos           = dimBlocks + lane_id * kVeclen;
+    int32_t lane_qry = qry_pos < dim ? reinterpret_cast<uint16_t const*>(query + qry_pos)[0] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += kVeclen, data += kIndexGroupSize * kVeclen) {
+      int32_t enc_pair = reinterpret_cast<uint16_t const*>(data)[lane_id];
+      int32_t qry_pair = shfl(lane_qry, d / kVeclen, WarpSize);
+      compute_dist(dist, qry_pair, enc_pair);
+    }
+  }
+};
+
+template <int kUnroll, typename Lambda>
+struct loadAndComputeDist<kUnroll, Lambda, 1, int8_t, int32_t> {
+  Lambda compute_dist;
+  int32_t& dist;
+  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
+    : compute_dist(op), dist(dist)
+  {
+  }
+  // int8_t, Veclen = 1: one byte of the index and one byte of the query per `compute_dist` call.
+  __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
+                                                      const int8_t* query_shared,
+                                                      int loadIndex,
+                                                      int shmemIndex)
+  {
+#pragma unroll
+    for (int u = 0; u < kUnroll; ++u) {
+      compute_dist(dist, query_shared[shmemIndex + u], data[loadIndex + u * kIndexGroupSize]);
+    }
+  }
+
+  __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
+                                                        const int8_t* query,
+                                                        int baseLoadIndex,
+                                                        const int lane_id)
+  {
+    constexpr int kVeclen = 1;
+    constexpr int kStride = kUnroll * kVeclen;
+    int32_t lane_qry      = query[baseLoadIndex + lane_id];
+
+#pragma unroll
+    for (int it = 0; it < WarpSize / kStride; ++it, data += kStride * kIndexGroupSize) {
+#pragma unroll
+      for (int u = 0; u < kUnroll; ++u) {
+        compute_dist(
+          dist, shfl(lane_qry, it * kUnroll + u, WarpSize), data[lane_id + u * kIndexGroupSize]);
+      }
+    }
+  }
+  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
+    const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks)
+  {
+    constexpr int kVeclen = 1;
+    const int qry_pos     = dimBlocks + lane_id;
+    int32_t lane_qry      = qry_pos < dim ? query[qry_pos] : 0;
+    for (int d = 0; d < dim - dimBlocks; d += kVeclen, data += kIndexGroupSize * kVeclen) {
+      compute_dist(dist, shfl(lane_qry, d, WarpSize), data[lane_id]);
+    }
+  }
+};
+
+/**
+ * Scan clusters for nearest neighbors of the query vectors.
+ * See `ivfflat_interleaved_scan` for more information.
+ *
+ * The clusters are stored in the interleaved index format described in ivf_flat_types.hpp.
+ * For each query vector, a set of clusters is probed: the distance to each vector in the cluster is
+ * calculated, and the top-k nearest neighbors are selected.
+ *
+ * @param compute_dist distance function
+ * @param post_process functor forwarded to `queue.store` when the selected results are written
+ * @param query_smem_elems number of dimensions of the query vector to fit in a shared memory of a
+ * block; this number must be a multiple of `WarpSize * Veclen`.
+ * @param[in] query a pointer to all queries in a row-major contiguous format [gridDim.y, dim]
+ * @param[in] coarse_index a pointer to the cluster indices to search through [gridDim.y, n_probes]
+ * @param[in] list_indices_ptrs per-list pointers to the ids enqueued together with the distances
+ * @param[in] list_data_ptrs per-list pointers to the interleaved vector data
+ * @param[in] list_sizes number of vectors in each list
+ * @param n_probes number of lists (clusters) probed per query
+ * @param k number of neighbors to select (also the per-block stride of the outputs)
+ * @param dim number of dimensions of the query and index vectors
+ * @param[out] neighbors selected ids, written at offset (blockIdx.y * gridDim.x + blockIdx.x) * k
+ * @param[out] distances selected distances, laid out like `neighbors`
+ */
+template <int Capacity,
+          int Veclen,
+          bool Ascending,
+          typename T,
+          typename AccT,
+          typename IdxT,
+          typename Lambda,
+          typename PostLambda>
+__global__ void __launch_bounds__(kThreadsPerBlock)
+  interleaved_scan_kernel(Lambda compute_dist,
+                          PostLambda post_process,
+                          const uint32_t query_smem_elems,
+                          const T* query,
+                          const uint32_t* coarse_index,
+                          const IdxT* const* list_indices_ptrs,
+                          const T* const* list_data_ptrs,
+                          const uint32_t* list_sizes,
+                          const uint32_t n_probes,
+                          const uint32_t k,
+                          const uint32_t dim,
+                          IdxT* neighbors,
+                          float* distances)
+{
+  extern __shared__ __align__(256) uint8_t interleaved_scan_kernel_smem[];
+  // Using shared memory for the (part of the) query;
+  // This allows to save on global memory bandwidth when reading index and query
+  // data at the same time.
+  // Its size is `query_smem_elems`.
+  T* query_shared = reinterpret_cast<T*>(interleaved_scan_kernel_smem);
+  // Offset the query, output and probe-list pointers to the slices owned by this block
+  {
+    const int query_id = blockIdx.y;
+    query += query_id * dim;
+    neighbors += query_id * k * gridDim.x + blockIdx.x * k;
+    distances += query_id * k * gridDim.x + blockIdx.x * k;
+    coarse_index += query_id * n_probes;
+  }
+
+  // Copy a part of the query into shared memory for faster processing
+  copy_vectorized(query_shared, query, std::min(dim, query_smem_elems));
+  __syncthreads();
+
+  using block_sort_t = matrix::detail::select::warpsort::block_sort<
+    matrix::detail::select::warpsort::warp_sort_filtered,
+    Capacity,
+    Ascending,
+    float,
+    IdxT>;
+  block_sort_t queue(k);
+
+  {
+    using align_warp  = Pow2<WarpSize>;
+    const int lane_id = align_warp::mod(threadIdx.x);
+
+    // How many full warps needed to compute the distance (without remainder)
+    const uint32_t full_warps_along_dim = align_warp::roundDown(dim);
+
+    const uint32_t shm_assisted_dim =
+      (dim > query_smem_elems) ? query_smem_elems : full_warps_along_dim;
+
+    // Every CUDA block scans one cluster at a time.
+    for (int probe_id = blockIdx.x; probe_id < n_probes; probe_id += gridDim.x) {
+      const uint32_t list_id = coarse_index[probe_id];  // The id of cluster(list)
+
+      // The number of vectors in each cluster(list); [nlist]
+      const uint32_t list_length = list_sizes[list_id];
+
+      // The number of interleaved groups to be processed
+      const uint32_t num_groups =
+        align_warp::div(list_length + align_warp::Mask);  // ceildiv by power of 2
+
+      constexpr int kUnroll        = WarpSize / Veclen;
+      constexpr uint32_t kNumWarps = kThreadsPerBlock / WarpSize;
+      // Every warp reads WarpSize vectors and computes the distances to them.
+      // Then, the distances and corresponding ids are distributed among the threads,
+      // and each thread adds one (id, dist) pair to the filtering queue.
+      for (uint32_t group_id = align_warp::div(threadIdx.x); group_id < num_groups;
+           group_id += kNumWarps) {
+        AccT dist = 0;
+        // This is where this warp begins reading data (start position of an interleaved group)
+        const T* data = list_data_ptrs[list_id] + (group_id * kIndexGroupSize) * dim;
+
+        // This is the vector a given lane/thread handles
+        const uint32_t vec_id = group_id * WarpSize + lane_id;
+        const bool valid      = vec_id < list_length;
+
+        // Process first shm_assisted_dim dimensions (always using shared memory)
+        if (valid) {
+          loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> lc(dist,
+                                                                                  compute_dist);
+          for (int pos = 0; pos < shm_assisted_dim;
+               pos += WarpSize, data += kIndexGroupSize * WarpSize) {
+            lc.runLoadShmemCompute(data, query_shared, lane_id, pos);
+          }
+        }
+
+        if (dim > query_smem_elems) {
+          // The default path - using shfl ops - for dimensions beyond query_smem_elems
+          loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> lc(dist,
+                                                                                  compute_dist);
+          for (int pos = shm_assisted_dim; pos < full_warps_along_dim; pos += WarpSize) {
+            lc.runLoadShflAndCompute(data, query, pos, lane_id);
+          }
+          lc.runLoadShflAndComputeRemainder(data, query, lane_id, dim, full_warps_along_dim);
+        } else {
+          // when  shm_assisted_dim == full_warps_along_dim < dim
+          if (valid) {
+            loadAndComputeDist<1, decltype(compute_dist), Veclen, T, AccT> lc(dist, compute_dist);
+            for (int pos = full_warps_along_dim; pos < dim;
+                 pos += Veclen, data += kIndexGroupSize * Veclen) {
+              lc.runLoadShmemCompute(data, query_shared, lane_id, pos);
+            }
+          }
+        }
+
+        // Enqueue one element per thread
+        const float val  = valid ? static_cast<float>(dist) : block_sort_t::queue_t::kDummy;
+        const size_t idx = valid ? static_cast<size_t>(list_indices_ptrs[list_id][vec_id]) : 0;
+        queue.add(val, idx);
+      }
+    }
+  }
+
+  // finalize and store selected neighbours
+  __syncthreads();
+  queue.done(interleaved_scan_kernel_smem);
+  queue.store(distances, neighbors, post_process);
+}
+
+/**
+ *  Configure the gridDim.x to maximize GPU occupancy, but reduce the output size: the result is
+ *  capped at n_probes and is >= 1 whenever n_probes >= 1; numQueries == 0 is treated as 1. */
+template <typename T>
+uint32_t configure_launch_x(uint32_t numQueries, uint32_t n_probes, int32_t sMemSize, T func)
+{
+  int dev_id = 0;
+  RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
+  int num_sms = 0;
+  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
+  int num_blocks_per_sm = 0;
+  RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    &num_blocks_per_sm, func, kThreadsPerBlock, sMemSize));
+  // No zero divisor, no zero result: launch_kernel reads grid_dim_x == 0 as "not configured yet".
+  size_t min_grid_size = static_cast<size_t>(num_sms) * num_blocks_per_sm;
+  size_t min_grid_x    = ceildiv<size_t>(min_grid_size, std::max<uint32_t>(numQueries, 1u));
+  return static_cast<uint32_t>(std::min<size_t>(std::max<size_t>(min_grid_x, 1), n_probes));
+}
+
+template <int Capacity,
+          int Veclen,
+          bool Ascending,
+          typename T,
+          typename AccT,
+          typename IdxT,
+          typename Lambda,
+          typename PostLambda>
+void launch_kernel(Lambda lambda,            // distance functor, passed to the kernel as is
+                   PostLambda post_process,  // post-processing functor, passed to the kernel as is
+                   const index<T, IdxT>& index,
+                   const T* queries,
+                   const uint32_t* coarse_index,
+                   const uint32_t num_queries,
+                   const uint32_t n_probes,
+                   const uint32_t k,
+                   IdxT* neighbors,
+                   float* distances,
+                   uint32_t& grid_dim_x,  // 0 on input: only write back a suggested value
+                   rmm::cuda_stream_view stream)
+{
+  RAFT_EXPECTS(Veclen == index.veclen(),
+               "Configured Veclen does not match the index interleaving pattern.");
+  constexpr auto kKernel =
+    interleaved_scan_kernel<Capacity, Veclen, Ascending, T, AccT, IdxT, Lambda, PostLambda>;
+  const int max_query_smem = 16384;  // bytes; caps the query part of the shared memory request
+  int query_smem_elems =
+    std::min<int>(max_query_smem / sizeof(T), Pow2<Veclen * WarpSize>::roundUp(index.dim()));
+  int smem_size              = query_smem_elems * sizeof(T);
+  constexpr int kSubwarpSize = std::min<int>(Capacity, WarpSize);
+  auto block_merge_mem =
+    raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide<AccT, IdxT>(
+      kThreadsPerBlock / kSubwarpSize, k);
+  smem_size += std::max<int>(smem_size, block_merge_mem);  // NOTE(review): is '+=' intended?
+
+  // power-of-two less than cuda limit (for better addr alignment)
+  constexpr uint32_t kMaxGridY = 32768;
+
+  if (grid_dim_x == 0) {  // configure-only call: suggest a grid size and launch nothing
+    grid_dim_x = configure_launch_x(std::min(kMaxGridY, num_queries), n_probes, smem_size, kKernel);
+    return;
+  }
+  // One launch per batch of at most kMaxGridY queries; the pointers advance past each batch.
+  for (uint32_t query_offset = 0; query_offset < num_queries; query_offset += kMaxGridY) {
+    uint32_t grid_dim_y = std::min<uint32_t>(kMaxGridY, num_queries - query_offset);
+    dim3 grid_dim(grid_dim_x, grid_dim_y, 1);
+    dim3 block_dim(kThreadsPerBlock);
+    RAFT_LOG_TRACE(
+      "Launching the ivf-flat interleaved_scan_kernel (%d, %d, 1) x (%d, 1, 1), n_probes = %d, "
+      "smem_size = %d",
+      grid_dim.x,
+      grid_dim.y,
+      block_dim.x,
+      n_probes,
+      smem_size);
+    kKernel<<<grid_dim, block_dim, smem_size, stream>>>(lambda,
+                                                        post_process,
+                                                        query_smem_elems,
+                                                        queries,
+                                                        coarse_index,
+                                                        index.inds_ptrs().data_handle(),
+                                                        index.data_ptrs().data_handle(),
+                                                        index.list_sizes().data_handle(),
+                                                        n_probes,
+                                                        k,
+                                                        index.dim(),
+                                                        neighbors,
+                                                        distances);
+    queries += static_cast<size_t>(grid_dim_y) * index.dim();  // size_t: 32-bit product can wrap
+    neighbors += static_cast<size_t>(grid_dim_y) * grid_dim_x * k;
+    distances += static_cast<size_t>(grid_dim_y) * grid_dim_x * k;
+  }
+}
+
+/** Squared-L2 term: adds (x - y)^2 for one pair of values to the running sum `acc`. */
+template <int Veclen, typename T, typename AccT>
+struct euclidean_dist {
+  __device__ __forceinline__ void operator()(AccT& acc, AccT x, AccT y)
+  {
+    acc += (x - y) * (x - y);
+  }
+};
+
+template <int Veclen>
+struct euclidean_dist<Veclen, uint8_t, uint32_t> {
+  __device__ __forceinline__ void operator()(uint32_t& acc, uint32_t x, uint32_t y)
+  {
+    if constexpr (Veclen > 1) {  // x and y are handled as four packed unsigned bytes
+      const auto diff = __vabsdiffu4(x, y);     // per-byte |x - y|
+      acc             = dp4a(diff, diff, acc);  // acc += sum of the squared byte differences
+    } else {  // Veclen <= 1: x and y are handled as whole 32-bit values
+      const auto diff = __usad(x, y, 0u);  // |x - y| + 0
+      acc += diff * diff;
+    }
+  }
+};
+
+template <int Veclen>
+struct euclidean_dist<Veclen, int8_t, int32_t> {
+  __device__ __forceinline__ void operator()(int32_t& acc, int32_t x, int32_t y)
+  {
+    if constexpr (Veclen > 1) {  // x and y are handled as four packed signed bytes
+      // Note that we enforce here that the unsigned version of dp4a is used, because the difference
+      // between two int8 numbers can be greater than 127 and therefore represented as a negative
+      // number in int8. Casting from int8 to int32 would yield incorrect results, while casting
+      // from uint8 to uint32 is correct.
+      const auto diff = __vabsdiffs4(x, y);  // per-byte |x - y| of signed bytes
+      acc             = dp4a(diff, diff, static_cast<uint32_t>(acc));  // cast selects unsigned dp4a
+    } else {  // Veclen <= 1: plain 32-bit arithmetic
+      const auto diff = x - y;
+      acc += diff * diff;
+    }
+  }
+};
+
+template <int Veclen, typename T, typename AccT>
+struct inner_prod_dist {
+  __device__ __forceinline__ void operator()(AccT& acc, AccT x, AccT y)  // acc += <x, y> term
+  {
+    if constexpr (Veclen > 1 && (std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>)) {
+      acc = dp4a(x, y, acc);  // 8-bit data with Veclen > 1: x and y go through dp4a as packed words
+    } else {
+      acc += x * y;  // every other case: one multiply-add per call
+    }
+  }
+};
+
+/**
+ * Select the distance computation function and forward the rest of the arguments.
+ *
+ * The runtime `metric` picks the (distance functor, post-processing functor) pair handed to
+ * `launch_kernel` as its two leading, value-initialized arguments:
+ *  - L2Expanded, L2Unexpanded:          euclidean_dist,  raft::identity_op
+ *  - L2SqrtExpanded, L2SqrtUnexpanded:  euclidean_dist,  raft::sqrt_op
+ *  - InnerProduct:                      inner_prod_dist, raft::identity_op
+ * Any other metric is rejected with RAFT_FAIL.
+ *
+ * @param metric the distance type to dispatch on
+ * @param args forwarded unchanged to `launch_kernel` after the two functors
+ */
+template <int Capacity,
+          int Veclen,
+          bool Ascending,
+          typename T,
+          typename AccT,
+          typename IdxT,
+          typename... Args>
+void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... args)
+{
+  using metric_t = raft::distance::DistanceType;
+  using l2_t     = euclidean_dist<Veclen, T, AccT>;
+  using ip_t     = inner_prod_dist<Veclen, T, AccT>;
+  switch (metric) {
+    // squared L2: results are stored as accumulated
+    case metric_t::L2Expanded:
+    case metric_t::L2Unexpanded:
+      return launch_kernel<Capacity, Veclen, Ascending, T, AccT, IdxT, l2_t, raft::identity_op>(
+        {}, {}, std::forward<Args>(args)...);
+    // L2 with the square root applied as post-processing
+    case metric_t::L2SqrtExpanded:
+    case metric_t::L2SqrtUnexpanded:
+      return launch_kernel<Capacity, Veclen, Ascending, T, AccT, IdxT, l2_t, raft::sqrt_op>(
+        {}, {}, std::forward<Args>(args)...);
+    // inner product: results are stored as accumulated
+    case metric_t::InnerProduct:
+      return launch_kernel<Capacity, Veclen, Ascending, T, AccT, IdxT, ip_t, raft::identity_op>(
+        {}, {}, std::forward<Args>(args)...);
+    // NB: update the description of `knn::ivf_flat::build` when adding here a new metric.
+    default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric));
+  }
+}
+
+/**
+ * Lift the `capacity`, `veclen` and `select_min` parameters to the template level,
+ * forward the rest of the arguments unmodified to `launch_with_fixed_consts`.
+ */
+template <typename T,
+          typename AccT,
+          typename IdxT,
+          int Capacity = matrix::detail::select::warpsort::kMaxCapacity,
+          int Veclen   = std::max<int>(1, 16 / sizeof(T))>
+struct select_interleaved_scan_kernel {
+  /**
+   * Reduce the `Capacity` and `Veclen` parameters until they match the runtime arguments:
+   * `Capacity` is halved while `capacity * 2 <= Capacity`; `Veclen` drops straight to 1 when
+   * `veclen` is not a multiple of it. Starting from the default (maximum) template values,
+   * any pair that still does not match afterwards is rejected by the RAFT_EXPECTS checks below.
+   */
+  template <typename... Args>
+  static inline void run(int capacity, int veclen, bool select_min, Args&&... args)
+  {
+    if constexpr (Capacity > 1) {
+      if (capacity * 2 <= Capacity) {  // the requested capacity fits into half of this one
+        return select_interleaved_scan_kernel<T, AccT, IdxT, Capacity / 2, Veclen>::run(
+          capacity, veclen, select_min, std::forward<Args>(args)...);
+      }
+    }
+    if constexpr (Veclen > 1) {
+      if (veclen % Veclen != 0) {  // no intermediate vector lengths are tried, only 1
+        return select_interleaved_scan_kernel<T, AccT, IdxT, Capacity, 1>::run(
+          capacity, veclen, select_min, std::forward<Args>(args)...);
+      }
+    }
+    // NB: this is the limitation of the warpsort structures that use a huge number of
+    //     registers (used in the main kernel here).
+    RAFT_EXPECTS(capacity == Capacity,
+                 "Capacity must be power-of-two not bigger than the maximum allowed size "
+                 "matrix::detail::select::warpsort::kMaxCapacity (%d).",
+                 matrix::detail::select::warpsort::kMaxCapacity);
+    RAFT_EXPECTS(
+      veclen == Veclen,
+      "Veclen must be power-of-two not bigger than the maximum allowed size for this data type.");
+    if (select_min) {  // becomes the `Ascending` template argument
+      launch_with_fixed_consts<Capacity, Veclen, true, T, AccT, IdxT>(std::forward<Args>(args)...);
+    } else {
+      launch_with_fixed_consts<Capacity, Veclen, false, T, AccT, IdxT>(std::forward<Args>(args)...);
+    }
+  }
+};
+
+/**
+ * @brief Configure and launch an appropriate template instance of the interleaved scan kernel.
+ *
+ * @tparam T value type
+ * @tparam AccT accumulated type
+ * @tparam IdxT type of the indices
+ *
+ * @param index previously built ivf-flat index
+ * @param[in] queries device pointer to the query vectors [batch_size, dim]
+ * @param[in] coarse_query_results device pointer to the cluster (list) ids [batch_size, n_probes]
+ * @param n_queries batch size
+ * @param metric type of the measured distance
+ * @param n_probes number of nearest clusters to query
+ * @param k number of nearest neighbors.
+ *            NB: the maximum value of `k` is limited statically by `kMaxCapacity`.
+ * @param select_min select the nearest (true) or furthest (false) points w.r.t. the given metric
+ * @param[out] neighbors device pointer to the result indices for each query and cluster
+ * [batch_size, grid_dim_x, k]
+ * @param[out] distances device pointer to the result distances for each query and cluster
+ * [batch_size, grid_dim_x, k]
+ * @param[inout] grid_dim_x number of blocks launched across all n_probes clusters;
+ *               (one block processes one or more probes, hence: 1 <= grid_dim_x <= n_probes).
+ *               If 0 on input, nothing is launched and a suggested value is written back instead.
+ * @param stream CUDA stream the kernel is launched on
+ */
+template <typename T, typename AccT, typename IdxT>
+void ivfflat_interleaved_scan(const index<T, IdxT>& index,
+                              const T* queries,
+                              const uint32_t* coarse_query_results,
+                              const uint32_t n_queries,
+                              const raft::distance::DistanceType metric,
+                              const uint32_t n_probes,
+                              const uint32_t k,
+                              const bool select_min,
+                              IdxT* neighbors,
+                              float* distances,
+                              uint32_t& grid_dim_x,
+                              rmm::cuda_stream_view stream)
+{
+  // greppable-id-specializations-ivf-flat-search: The ivfflat_interleaved_scan
+  // function is used in both raft::neighbors::ivf_flat::search and
+  // raft::neighbors::detail::refine_device. To prevent a duplicate
+  // instantiation of this function (which defines ~270 kernels) in the refine
+  // specializations, an extern template definition is provided. Please check
+  // related function calls after editing this function definition. Search for
+  // `greppable-id-specializations-ivf-flat-search` to find them.
+
+  const int capacity = bound_by_power_of_two(k);  // runtime capacity matched against `Capacity`
+  select_interleaved_scan_kernel<T, AccT, IdxT>::run(capacity,
+                                                     index.veclen(),
+                                                     select_min,
+                                                     metric,
+                                                     index,
+                                                     queries,
+                                                     coarse_query_results,
+                                                     n_queries,
+                                                     n_probes,
+                                                     k,
+                                                     neighbors,
+                                                     distances,
+                                                     grid_dim_x,
+                                                     stream);
+}
+
+}  // namespace raft::neighbors::ivf_flat::detail
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh
index e69de29bb2..74a1a84e74 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef RAFT_COMPILED
+#include "ivf_flat_interleaved_scan-ext.cuh"
+#endif
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "ivf_flat_interleaved_scan-inl.cuh"
+#endif
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
new file mode 100644
index 0000000000..529412a17c
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>                                // uintX_t
+#include <raft/neighbors/ivf_flat_types.hpp>      // index
+#include <raft/spatial/knn/detail/ann_utils.cuh>  // TODO: consider remove
+#include <raft/util/raft_explicit.hpp>            // RAFT_EXPLICIT
+#include <rmm/cuda_stream_view.hpp>               // rmm::cuda_stream_view
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE
+
+namespace raft::neighbors::ivf_flat::detail {
+
+/** Declaration only (no body in this header). See raft::neighbors::ivf_flat::search docs */
+template <typename T, typename IdxT>
+void search(raft::device_resources const& handle,
+            const search_params& params,
+            const raft::neighbors::ivf_flat::index<T, IdxT>& index,
+            const T* queries,
+            uint32_t n_queries,
+            uint32_t k,
+            IdxT* neighbors,
+            float* distances,
+            rmm::mr::device_memory_resource* mr = nullptr);  // the only defaulted argument
+
+}  // namespace raft::neighbors::ivf_flat::detail
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+
+#define instantiate_raft_neighbors_ivf_flat_detail_search(T, IdxT)         \
+  extern template void raft::neighbors::ivf_flat::detail::search<T, IdxT>( \
+    raft::device_resources const& handle,                                  \
+    const search_params& params,                                           \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,                \
+    const T* queries,                                                      \
+    uint32_t n_queries,                                                    \
+    uint32_t k,                                                            \
+    IdxT* neighbors,                                                       \
+    float* distances,                                                      \
+    rmm::mr::device_memory_resource* mr)
+
+instantiate_raft_neighbors_ivf_flat_detail_search(float, int64_t);  // `extern template`: declares
+instantiate_raft_neighbors_ivf_flat_detail_search(int8_t, int64_t);  // the instantiation without
+instantiate_raft_neighbors_ivf_flat_detail_search(uint8_t, int64_t);  // generating it here
+
+#undef instantiate_raft_neighbors_ivf_flat_detail_search  // helper macro stays local to this header
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
index e6533eaf51..92ba3613d8 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
@@ -16,1083 +16,22 @@
 
 #pragma once
 
-#include <raft/core/cudart_utils.hpp>
-#include <raft/core/device_resources.hpp>
-#include <raft/core/logger.hpp>
-#include <raft/core/mdarray.hpp>
-#include <raft/core/operators.hpp>
-#include <raft/distance/distance.cuh>
-#include <raft/distance/distance_types.hpp>
-#include <raft/linalg/norm.cuh>
-#include <raft/linalg/unary_op.cuh>
-#include <raft/matrix/detail/select_k.cuh>
-#include <raft/matrix/detail/select_warpsort.cuh>
-#include <raft/neighbors/ivf_flat_types.hpp>
-#include <raft/spatial/knn/detail/ann_utils.cuh>
-#include <raft/util/cuda_utils.cuh>
-#include <raft/util/device_loads_stores.cuh>
-#include <raft/util/integer_utils.hpp>
-#include <raft/util/pow2_utils.cuh>
-#include <raft/util/vectorized.cuh>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
+#include <raft/core/device_resources.hpp>                       // raft::device_resources
+#include <raft/core/logger.hpp>                                 // RAFT_LOG_TRACE
+#include <raft/distance/distance_types.hpp>                     // is_min_close, DistanceType
+#include <raft/linalg/gemm.cuh>                                 // raft::linalg::gemm
+#include <raft/linalg/norm.cuh>                                 // raft::linalg::norm
+#include <raft/linalg/unary_op.cuh>                             // raft::linalg::unary_op
+#include <raft/matrix/detail/select_k.cuh>                      // matrix::detail::select_k
+#include <raft/neighbors/detail/ivf_flat_interleaved_scan.cuh>  // interleaved_scan
+#include <raft/neighbors/ivf_flat_types.hpp>                    // raft::neighbors::ivf_flat::index
+#include <rmm/mr/device/per_device_resource.hpp>                // rmm::device_memory_resource
 
 namespace raft::neighbors::ivf_flat::detail {
 
-using namespace raft::spatial::knn::detail;  // NOLINT
-
-constexpr int kThreadsPerBlock = 128;
-
-/**
- * @brief Copy `n` elements per block from one place to another.
- *
- * @param[out] out target pointer (unique per block)
- * @param[in] in source pointer
- * @param n number of elements to copy
- */
-template <int VecBytes = 16, typename T>
-__device__ inline void copy_vectorized(T* out, const T* in, uint32_t n)
-{
-  constexpr int VecElems = VecBytes / sizeof(T);  // NOLINT
-  using align_bytes      = Pow2<(size_t)VecBytes>;
-  if constexpr (VecElems > 1) {
-    using align_elems = Pow2<VecElems>;
-    if (!align_bytes::areSameAlignOffsets(out, in)) {
-      return copy_vectorized<(VecBytes >> 1), T>(out, in, n);
-    }
-    {  // process unaligned head
-      uint32_t head = align_bytes::roundUp(in) - in;
-      if (head > 0) {
-        copy_vectorized<sizeof(T), T>(out, in, head);
-        n -= head;
-        in += head;
-        out += head;
-      }
-    }
-    {  // process main part vectorized
-      using vec_t = typename IOType<T, VecElems>::Type;
-      copy_vectorized<sizeof(vec_t), vec_t>(
-        reinterpret_cast<vec_t*>(out), reinterpret_cast<const vec_t*>(in), align_elems::div(n));
-    }
-    {  // process unaligned tail
-      uint32_t tail = align_elems::mod(n);
-      if (tail > 0) {
-        n -= tail;
-        copy_vectorized<sizeof(T), T>(out + n, in + n, tail);
-      }
-    }
-  }
-  if constexpr (VecElems <= 1) {
-    for (int i = threadIdx.x; i < n; i += blockDim.x) {
-      out[i] = in[i];
-    }
-  }
-}
-
-/**
- * @brief Load a part of a vector from the index and from query, compute the (part of the) distance
- * between them, and aggregate it using the provided Lambda; one structure per thread, per query,
- * and per index item.
- *
- * @tparam kUnroll elements per loop (normally, kUnroll = WarpSize / Veclen)
- * @tparam Lambda computing the part of the distance for one dimension and aggregating it:
- *                void (AccT& acc, AccT x, AccT y)
- * @tparam Veclen size of the vectorized load
- * @tparam T type of the data in the query and the index
- * @tparam AccT type of the accumulated value (an optimization for 8bit values to be loaded as 32bit
- * values)
- */
-template <int kUnroll, typename Lambda, int Veclen, typename T, typename AccT>
-struct loadAndComputeDist {
-  Lambda compute_dist;
-  AccT& dist;
-
-  __device__ __forceinline__ loadAndComputeDist(AccT& dist, Lambda op)
-    : dist(dist), compute_dist(op)
-  {
-  }
-
-  /**
-   * Load parts of vectors from the index and query and accumulates the partial distance.
-   * This version assumes the query is stored in shared memory.
-   * Every thread here processes exactly kUnroll * Veclen elements independently of others.
-   */
-  template <typename IdxT>
-  __device__ __forceinline__ void runLoadShmemCompute(const T* const& data,
-                                                      const T* query_shared,
-                                                      IdxT loadIndex,
-                                                      IdxT shmemIndex)
-  {
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      T encV[Veclen];
-      ldg(encV, data + (loadIndex + j * kIndexGroupSize) * Veclen);
-      T queryRegs[Veclen];
-      lds(queryRegs, &query_shared[shmemIndex + j * Veclen]);
-#pragma unroll
-      for (int k = 0; k < Veclen; ++k) {
-        compute_dist(dist, queryRegs[k], encV[k]);
-      }
-    }
-  }
-
-  /**
-   * Load parts of vectors from the index and query and accumulates the partial distance.
-   * This version assumes the query is stored in the global memory and is different for every
-   * thread. One warp loads exactly WarpSize query elements at once and then reshuffles them into
-   * corresponding threads (`WarpSize / (kUnroll * Veclen)` elements per thread at once).
-   */
-  template <typename IdxT>
-  __device__ __forceinline__ void runLoadShflAndCompute(const T*& data,
-                                                        const T* query,
-                                                        IdxT baseLoadIndex,
-                                                        const int lane_id)
-  {
-    T queryReg               = query[baseLoadIndex + lane_id];
-    constexpr int stride     = kUnroll * Veclen;
-    constexpr int totalIter  = WarpSize / stride;
-    constexpr int gmemStride = stride * kIndexGroupSize;
-#pragma unroll
-    for (int i = 0; i < totalIter; ++i, data += gmemStride) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        T encV[Veclen];
-        ldg(encV, data + (lane_id + j * kIndexGroupSize) * Veclen);
-        const int d = (i * kUnroll + j) * Veclen;
-#pragma unroll
-        for (int k = 0; k < Veclen; ++k) {
-          compute_dist(dist, shfl(queryReg, d + k, WarpSize), encV[k]);
-        }
-      }
-    }
-  }
-
-  /**
-   * Load parts of vectors from the index and query and accumulates the partial distance.
-   * This version augments `runLoadShflAndCompute` when `dim` is not a multiple of `WarpSize`.
-   */
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
-    const T*& data, const T* query, const int lane_id, const int dim, const int dimBlocks)
-  {
-    const int loadDim     = dimBlocks + lane_id;
-    T queryReg            = loadDim < dim ? query[loadDim] : 0;
-    const int loadDataIdx = lane_id * Veclen;
-    for (int d = 0; d < dim - dimBlocks; d += Veclen, data += kIndexGroupSize * Veclen) {
-      T enc[Veclen];
-      ldg(enc, data + loadDataIdx);
-#pragma unroll
-      for (int k = 0; k < Veclen; k++) {
-        compute_dist(dist, shfl(queryReg, d + k, WarpSize), enc[k]);
-      }
-    }
-  }
-};
-
-// This handles uint8_t 8, 16 Veclens
-template <int kUnroll, typename Lambda, int uint8_veclen>
-struct loadAndComputeDist<kUnroll, Lambda, uint8_veclen, uint8_t, uint32_t> {
-  Lambda compute_dist;
-  uint32_t& dist;
-
-  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
-    : dist(dist), compute_dist(op)
-  {
-  }
-
-  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
-                                                      const uint8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-    constexpr int veclen_int = uint8_veclen / 4;  // converting uint8_t veclens to int
-    loadIndex                = loadIndex * veclen_int;
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      uint32_t encV[veclen_int];
-      ldg(encV,
-          reinterpret_cast<unsigned const*>(data) + loadIndex + j * kIndexGroupSize * veclen_int);
-      uint32_t queryRegs[veclen_int];
-      lds(queryRegs, reinterpret_cast<unsigned const*>(query_shared + shmemIndex) + j * veclen_int);
-#pragma unroll
-      for (int k = 0; k < veclen_int; k++) {
-        compute_dist(dist, queryRegs[k], encV[k]);
-      }
-    }
-  }
-  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
-                                                        const uint8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    constexpr int veclen_int = uint8_veclen / 4;  // converting uint8_t veclens to int
-    uint32_t queryReg =
-      (lane_id < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[lane_id] : 0;
-    constexpr int stride = kUnroll * uint8_veclen;
-
-#pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV[veclen_int];
-        ldg(encV,
-            reinterpret_cast<unsigned const*>(data) + (lane_id + j * kIndexGroupSize) * veclen_int);
-        const int d = (i * kUnroll + j) * veclen_int;
-#pragma unroll
-        for (int k = 0; k < veclen_int; ++k) {
-          compute_dist(dist, shfl(queryReg, d + k, WarpSize), encV[k]);
-        }
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
-                                                                 const uint8_t* query,
-                                                                 const int lane_id,
-                                                                 const int dim,
-                                                                 const int dimBlocks)
-  {
-    constexpr int veclen_int = uint8_veclen / 4;
-    const int loadDim        = dimBlocks + lane_id * 4;  // Here 4 is for 1 - int
-    uint32_t queryReg = loadDim < dim ? reinterpret_cast<uint32_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks;
-         d += uint8_veclen, data += kIndexGroupSize * uint8_veclen) {
-      uint32_t enc[veclen_int];
-      ldg(enc, reinterpret_cast<uint32_t const*>(data) + lane_id * veclen_int);
-#pragma unroll
-      for (int k = 0; k < veclen_int; k++) {
-        uint32_t q = shfl(queryReg, (d / 4) + k, WarpSize);
-        compute_dist(dist, q, enc[k]);
-      }
-    }
-  }
-};
-
-// Keep this specialized uint8 Veclen = 4, because compiler is generating suboptimal code while
-// using above common template of int2/int4
-template <int kUnroll, typename Lambda>
-struct loadAndComputeDist<kUnroll, Lambda, 4, uint8_t, uint32_t> {
-  Lambda compute_dist;
-  uint32_t& dist;
-
-  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
-    : dist(dist), compute_dist(op)
-  {
-  }
-
-  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
-                                                      const uint8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      uint32_t encV      = reinterpret_cast<unsigned const*>(data)[loadIndex + j * kIndexGroupSize];
-      uint32_t queryRegs = reinterpret_cast<unsigned const*>(query_shared + shmemIndex)[j];
-      compute_dist(dist, queryRegs, encV);
-    }
-  }
-  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
-                                                        const uint8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    uint32_t queryReg =
-      (lane_id < 8) ? reinterpret_cast<unsigned const*>(query + baseLoadIndex)[lane_id] : 0;
-    constexpr int veclen = 4;
-    constexpr int stride = kUnroll * veclen;
-
-#pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV = reinterpret_cast<unsigned const*>(data)[lane_id + j * kIndexGroupSize];
-        uint32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
-        compute_dist(dist, q, encV);
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
-                                                                 const uint8_t* query,
-                                                                 const int lane_id,
-                                                                 const int dim,
-                                                                 const int dimBlocks)
-  {
-    constexpr int veclen = 4;
-    const int loadDim    = dimBlocks + lane_id;
-    uint32_t queryReg    = loadDim < dim ? reinterpret_cast<unsigned const*>(query)[loadDim] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      uint32_t enc = reinterpret_cast<unsigned const*>(data)[lane_id];
-      uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
-      compute_dist(dist, q, enc);
-    }
-  }
-};
-
-template <int kUnroll, typename Lambda>
-struct loadAndComputeDist<kUnroll, Lambda, 2, uint8_t, uint32_t> {
-  Lambda compute_dist;
-  uint32_t& dist;
-
-  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
-    : dist(dist), compute_dist(op)
-  {
-  }
-
-  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
-                                                      const uint8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      uint32_t encV      = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * kIndexGroupSize];
-      uint32_t queryRegs = reinterpret_cast<uint16_t const*>(query_shared + shmemIndex)[j];
-      compute_dist(dist, queryRegs, encV);
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
-                                                        const uint8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    uint32_t queryReg =
-      (lane_id < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[lane_id] : 0;
-    constexpr int veclen = 2;
-    constexpr int stride = kUnroll * veclen;
-
-#pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV = reinterpret_cast<uint16_t const*>(data)[lane_id + j * kIndexGroupSize];
-        uint32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
-        compute_dist(dist, q, encV);
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
-                                                                 const uint8_t* query,
-                                                                 const int lane_id,
-                                                                 const int dim,
-                                                                 const int dimBlocks)
-  {
-    constexpr int veclen = 2;
-    int loadDim          = dimBlocks + lane_id * veclen;
-    uint32_t queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      uint32_t enc = reinterpret_cast<uint16_t const*>(data)[lane_id];
-      uint32_t q   = shfl(queryReg, d / veclen, WarpSize);
-      compute_dist(dist, q, enc);
-    }
-  }
-};
-
-template <int kUnroll, typename Lambda>
-struct loadAndComputeDist<kUnroll, Lambda, 1, uint8_t, uint32_t> {
-  Lambda compute_dist;
-  uint32_t& dist;
-
-  __device__ __forceinline__ loadAndComputeDist(uint32_t& dist, Lambda op)
-    : dist(dist), compute_dist(op)
-  {
-  }
-
-  __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data,
-                                                      const uint8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      uint32_t encV      = data[loadIndex + j * kIndexGroupSize];
-      uint32_t queryRegs = query_shared[shmemIndex + j];
-      compute_dist(dist, queryRegs, encV);
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data,
-                                                        const uint8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    uint32_t queryReg    = query[baseLoadIndex + lane_id];
-    constexpr int veclen = 1;
-    constexpr int stride = kUnroll * veclen;
-
-#pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        uint32_t encV = data[lane_id + j * kIndexGroupSize];
-        uint32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
-        compute_dist(dist, q, encV);
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data,
-                                                                 const uint8_t* query,
-                                                                 const int lane_id,
-                                                                 const int dim,
-                                                                 const int dimBlocks)
-  {
-    constexpr int veclen = 1;
-    int loadDim          = dimBlocks + lane_id;
-    uint32_t queryReg    = loadDim < dim ? query[loadDim] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      uint32_t enc = data[lane_id];
-      uint32_t q   = shfl(queryReg, d, WarpSize);
-      compute_dist(dist, q, enc);
-    }
-  }
-};
-
-// This device function is for int8 veclens 4, 8 and 16
-template <int kUnroll, typename Lambda, int int8_veclen>
-struct loadAndComputeDist<kUnroll, Lambda, int8_veclen, int8_t, int32_t> {
-  Lambda compute_dist;
-  int32_t& dist;
-
-  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
-    : dist(dist), compute_dist(op)
-  {
-  }
-
-  __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
-                                                      const int8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-    constexpr int veclen_int = int8_veclen / 4;  // converting int8_t veclens to int
-
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      int32_t encV[veclen_int];
-      ldg(encV,
-          reinterpret_cast<int32_t const*>(data) + (loadIndex + j * kIndexGroupSize) * veclen_int);
-      int32_t queryRegs[veclen_int];
-      lds(queryRegs, reinterpret_cast<int32_t const*>(query_shared + shmemIndex) + j * veclen_int);
-#pragma unroll
-      for (int k = 0; k < veclen_int; k++) {
-        compute_dist(dist, queryRegs[k], encV[k]);
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
-                                                        const int8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    constexpr int veclen_int = int8_veclen / 4;  // converting int8_t veclens to int
-
-    int32_t queryReg =
-      (lane_id < 8) ? reinterpret_cast<int32_t const*>(query + baseLoadIndex)[lane_id] : 0;
-    constexpr int stride = kUnroll * int8_veclen;
-
-#pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        int32_t encV[veclen_int];
-        ldg(encV,
-            reinterpret_cast<int32_t const*>(data) + (lane_id + j * kIndexGroupSize) * veclen_int);
-        const int d = (i * kUnroll + j) * veclen_int;
-#pragma unroll
-        for (int k = 0; k < veclen_int; ++k) {
-          int32_t q = shfl(queryReg, d + k, WarpSize);
-          compute_dist(dist, q, encV[k]);
-        }
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
-    const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks)
-  {
-    constexpr int veclen_int = int8_veclen / 4;
-    const int loadDim        = dimBlocks + lane_id * 4;  // Here 4 is for 1 - int;
-    int32_t queryReg = loadDim < dim ? reinterpret_cast<int32_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += int8_veclen, data += kIndexGroupSize * int8_veclen) {
-      int32_t enc[veclen_int];
-      ldg(enc, reinterpret_cast<int32_t const*>(data) + lane_id * veclen_int);
-#pragma unroll
-      for (int k = 0; k < veclen_int; k++) {
-        int32_t q = shfl(queryReg, (d / 4) + k, WarpSize);  // Here 4 is for 1 - int;
-        compute_dist(dist, q, enc[k]);
-      }
-    }
-  }
-};
-
-template <int kUnroll, typename Lambda>
-struct loadAndComputeDist<kUnroll, Lambda, 2, int8_t, int32_t> {
-  Lambda compute_dist;
-  int32_t& dist;
-  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
-    : dist(dist), compute_dist(op)
-  {
-  }
-  __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
-                                                      const int8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      int32_t encV      = reinterpret_cast<uint16_t const*>(data)[loadIndex + j * kIndexGroupSize];
-      int32_t queryRegs = reinterpret_cast<uint16_t const*>(query_shared + shmemIndex)[j];
-      compute_dist(dist, queryRegs, encV);
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
-                                                        const int8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    int32_t queryReg =
-      (lane_id < 16) ? reinterpret_cast<uint16_t const*>(query + baseLoadIndex)[lane_id] : 0;
-    constexpr int veclen = 2;
-    constexpr int stride = kUnroll * veclen;
-
-#pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        int32_t encV = reinterpret_cast<uint16_t const*>(data)[lane_id + j * kIndexGroupSize];
-        int32_t q    = shfl(queryReg, i * kUnroll + j, WarpSize);
-        compute_dist(dist, q, encV);
-      }
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
-    const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks)
-  {
-    constexpr int veclen = 2;
-    int loadDim          = dimBlocks + lane_id * veclen;
-    int32_t queryReg = loadDim < dim ? reinterpret_cast<uint16_t const*>(query + loadDim)[0] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      int32_t enc = reinterpret_cast<uint16_t const*>(data + lane_id * veclen)[0];
-      int32_t q   = shfl(queryReg, d / veclen, WarpSize);
-      compute_dist(dist, q, enc);
-    }
-  }
-};
-
-template <int kUnroll, typename Lambda>
-struct loadAndComputeDist<kUnroll, Lambda, 1, int8_t, int32_t> {
-  Lambda compute_dist;
-  int32_t& dist;
-  __device__ __forceinline__ loadAndComputeDist(int32_t& dist, Lambda op)
-    : dist(dist), compute_dist(op)
-  {
-  }
-
-  __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data,
-                                                      const int8_t* query_shared,
-                                                      int loadIndex,
-                                                      int shmemIndex)
-  {
-#pragma unroll
-    for (int j = 0; j < kUnroll; ++j) {
-      compute_dist(dist, query_shared[shmemIndex + j], data[loadIndex + j * kIndexGroupSize]);
-    }
-  }
-
-  __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data,
-                                                        const int8_t* query,
-                                                        int baseLoadIndex,
-                                                        const int lane_id)
-  {
-    constexpr int veclen = 1;
-    constexpr int stride = kUnroll * veclen;
-    int32_t queryReg     = query[baseLoadIndex + lane_id];
-
-#pragma unroll
-    for (int i = 0; i < WarpSize / stride; ++i, data += stride * kIndexGroupSize) {
-#pragma unroll
-      for (int j = 0; j < kUnroll; ++j) {
-        compute_dist(
-          dist, shfl(queryReg, i * kUnroll + j, WarpSize), data[lane_id + j * kIndexGroupSize]);
-      }
-    }
-  }
-  __device__ __forceinline__ void runLoadShflAndComputeRemainder(
-    const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks)
-  {
-    constexpr int veclen = 1;
-    const int loadDim    = dimBlocks + lane_id;
-    int32_t queryReg     = loadDim < dim ? query[loadDim] : 0;
-    for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) {
-      compute_dist(dist, shfl(queryReg, d, WarpSize), data[lane_id]);
-    }
-  }
-};
-
-/**
- * Scan clusters for nearest neighbors of the query vectors.
- * See `ivfflat_interleaved_scan` for more information.
- *
- * The clusters are stored in the interleaved index format described in ivf_flat_types.hpp.
- * For each query vector, a set of clusters is probed: the distance to each vector in the cluster is
- * calculated, and the top-k nearest neighbors are selected.
- *
- * @param compute_dist distance function
- * @param query_smem_elems number of dimensions of the query vector to fit in a shared memory of a
- * block; this number must be a multiple of `WarpSize * Veclen`.
- * @param[in] query a pointer to all queries in a row-major contiguous format [gridDim.y, dim]
- * @param[in] coarse_index a pointer to the cluster indices to search through [n_probes]
- * @param[in] list_indices index<T, IdxT>.indices
- * @param[in] list_data index<T, IdxT>.data
- * @param[in] list_sizes index<T, IdxT>.list_sizes
- * @param[in] list_offsets index<T, IdxT>.list_offsets
- * @param n_probes
- * @param k
- * @param dim
- * @param[out] neighbors
- * @param[out] distances
- */
-template <int Capacity,
-          int Veclen,
-          bool Ascending,
-          typename T,
-          typename AccT,
-          typename IdxT,
-          typename Lambda,
-          typename PostLambda>
-__global__ void __launch_bounds__(kThreadsPerBlock)
-  interleaved_scan_kernel(Lambda compute_dist,
-                          PostLambda post_process,
-                          const uint32_t query_smem_elems,
-                          const T* query,
-                          const uint32_t* coarse_index,
-                          const IdxT* const* list_indices_ptrs,
-                          const T* const* list_data_ptrs,
-                          const uint32_t* list_sizes,
-                          const uint32_t n_probes,
-                          const uint32_t k,
-                          const uint32_t dim,
-                          IdxT* neighbors,
-                          float* distances)
-{
-  extern __shared__ __align__(256) uint8_t interleaved_scan_kernel_smem[];
-  // Using shared memory for the (part of the) query;
-  // This allows to save on global memory bandwidth when reading index and query
-  // data at the same time.
-  // Its size is `query_smem_elems`.
-  T* query_shared = reinterpret_cast<T*>(interleaved_scan_kernel_smem);
-  // Make the query input and output point to this block's shared query
-  {
-    const int query_id = blockIdx.y;
-    query += query_id * dim;
-    neighbors += query_id * k * gridDim.x + blockIdx.x * k;
-    distances += query_id * k * gridDim.x + blockIdx.x * k;
-    coarse_index += query_id * n_probes;
-  }
-
-  // Copy a part of the query into shared memory for faster processing
-  copy_vectorized(query_shared, query, std::min(dim, query_smem_elems));
-  __syncthreads();
-
-  using block_sort_t = matrix::detail::select::warpsort::block_sort<
-    matrix::detail::select::warpsort::warp_sort_filtered,
-    Capacity,
-    Ascending,
-    float,
-    IdxT>;
-  block_sort_t queue(k);
-
-  {
-    using align_warp  = Pow2<WarpSize>;
-    const int lane_id = align_warp::mod(threadIdx.x);
-
-    // How many full warps needed to compute the distance (without remainder)
-    const uint32_t full_warps_along_dim = align_warp::roundDown(dim);
-
-    const uint32_t shm_assisted_dim =
-      (dim > query_smem_elems) ? query_smem_elems : full_warps_along_dim;
-
-    // Every CUDA block scans one cluster at a time.
-    for (int probe_id = blockIdx.x; probe_id < n_probes; probe_id += gridDim.x) {
-      const uint32_t list_id = coarse_index[probe_id];  // The id of cluster(list)
-
-      // The number of vectors in each cluster(list); [nlist]
-      const uint32_t list_length = list_sizes[list_id];
-
-      // The number of interleaved groups to be processed
-      const uint32_t num_groups =
-        align_warp::div(list_length + align_warp::Mask);  // ceildiv by power of 2
-
-      constexpr int kUnroll        = WarpSize / Veclen;
-      constexpr uint32_t kNumWarps = kThreadsPerBlock / WarpSize;
-      // Every warp reads WarpSize vectors and computes the distances to them.
-      // Then, the distances and corresponding ids are distributed among the threads,
-      // and each thread adds one (id, dist) pair to the filtering queue.
-      for (uint32_t group_id = align_warp::div(threadIdx.x); group_id < num_groups;
-           group_id += kNumWarps) {
-        AccT dist = 0;
-        // This is where this warp begins reading data (start position of an interleaved group)
-        const T* data = list_data_ptrs[list_id] + (group_id * kIndexGroupSize) * dim;
-
-        // This is the vector a given lane/thread handles
-        const uint32_t vec_id = group_id * WarpSize + lane_id;
-        const bool valid      = vec_id < list_length;
-
-        // Process first shm_assisted_dim dimensions (always using shared memory)
-        if (valid) {
-          loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> lc(dist,
-                                                                                  compute_dist);
-          for (int pos = 0; pos < shm_assisted_dim;
-               pos += WarpSize, data += kIndexGroupSize * WarpSize) {
-            lc.runLoadShmemCompute(data, query_shared, lane_id, pos);
-          }
-        }
-
-        if (dim > query_smem_elems) {
-          // The default path - using shfl ops - for dimensions beyond query_smem_elems
-          loadAndComputeDist<kUnroll, decltype(compute_dist), Veclen, T, AccT> lc(dist,
-                                                                                  compute_dist);
-          for (int pos = shm_assisted_dim; pos < full_warps_along_dim; pos += WarpSize) {
-            lc.runLoadShflAndCompute(data, query, pos, lane_id);
-          }
-          lc.runLoadShflAndComputeRemainder(data, query, lane_id, dim, full_warps_along_dim);
-        } else {
-          // when  shm_assisted_dim == full_warps_along_dim < dim
-          if (valid) {
-            loadAndComputeDist<1, decltype(compute_dist), Veclen, T, AccT> lc(dist, compute_dist);
-            for (int pos = full_warps_along_dim; pos < dim;
-                 pos += Veclen, data += kIndexGroupSize * Veclen) {
-              lc.runLoadShmemCompute(data, query_shared, lane_id, pos);
-            }
-          }
-        }
-
-        // Enqueue one element per thread
-        const float val  = valid ? static_cast<float>(dist) : block_sort_t::queue_t::kDummy;
-        const size_t idx = valid ? static_cast<size_t>(list_indices_ptrs[list_id][vec_id]) : 0;
-        queue.add(val, idx);
-      }
-    }
-  }
-
-  // finalize and store selected neighbours
-  __syncthreads();
-  queue.done(interleaved_scan_kernel_smem);
-  queue.store(distances, neighbors, post_process);
-}
-
-/**
- *  Configure the gridDim.x to maximize GPU occupancy, but reduce the output size
- */
-template <typename T>
-uint32_t configure_launch_x(uint32_t numQueries, uint32_t n_probes, int32_t sMemSize, T func)
-{
-  int dev_id;
-  RAFT_CUDA_TRY(cudaGetDevice(&dev_id));
-  int num_sms;
-  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
-  int num_blocks_per_sm = 0;
-  RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &num_blocks_per_sm, func, kThreadsPerBlock, sMemSize));
-
-  size_t min_grid_size = num_sms * num_blocks_per_sm;
-  size_t min_grid_x    = ceildiv<size_t>(min_grid_size, numQueries);
-  return min_grid_x > n_probes ? n_probes : static_cast<uint32_t>(min_grid_x);
-}
-
-template <int Capacity,
-          int Veclen,
-          bool Ascending,
-          typename T,
-          typename AccT,
-          typename IdxT,
-          typename Lambda,
-          typename PostLambda>
-void launch_kernel(Lambda lambda,
-                   PostLambda post_process,
-                   const index<T, IdxT>& index,
-                   const T* queries,
-                   const uint32_t* coarse_index,
-                   const uint32_t num_queries,
-                   const uint32_t n_probes,
-                   const uint32_t k,
-                   IdxT* neighbors,
-                   float* distances,
-                   uint32_t& grid_dim_x,
-                   rmm::cuda_stream_view stream)
-{
-  RAFT_EXPECTS(Veclen == index.veclen(),
-               "Configured Veclen does not match the index interleaving pattern.");
-  constexpr auto kKernel =
-    interleaved_scan_kernel<Capacity, Veclen, Ascending, T, AccT, IdxT, Lambda, PostLambda>;
-  const int max_query_smem = 16384;
-  int query_smem_elems =
-    std::min<int>(max_query_smem / sizeof(T), Pow2<Veclen * WarpSize>::roundUp(index.dim()));
-  int smem_size              = query_smem_elems * sizeof(T);
-  constexpr int kSubwarpSize = std::min<int>(Capacity, WarpSize);
-  auto block_merge_mem =
-    raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide<AccT, IdxT>(
-      kThreadsPerBlock / kSubwarpSize, k);
-  smem_size += std::max<int>(smem_size, block_merge_mem);
-
-  // power-of-two less than cuda limit (for better addr alignment)
-  constexpr uint32_t kMaxGridY = 32768;
-
-  if (grid_dim_x == 0) {
-    grid_dim_x = configure_launch_x(std::min(kMaxGridY, num_queries), n_probes, smem_size, kKernel);
-    return;
-  }
-
-  for (uint32_t query_offset = 0; query_offset < num_queries; query_offset += kMaxGridY) {
-    uint32_t grid_dim_y = std::min<uint32_t>(kMaxGridY, num_queries - query_offset);
-    dim3 grid_dim(grid_dim_x, grid_dim_y, 1);
-    dim3 block_dim(kThreadsPerBlock);
-    RAFT_LOG_TRACE(
-      "Launching the ivf-flat interleaved_scan_kernel (%d, %d, 1) x (%d, 1, 1), n_probes = %d, "
-      "smem_size = %d",
-      grid_dim.x,
-      grid_dim.y,
-      block_dim.x,
-      n_probes,
-      smem_size);
-    kKernel<<<grid_dim, block_dim, smem_size, stream>>>(lambda,
-                                                        post_process,
-                                                        query_smem_elems,
-                                                        queries,
-                                                        coarse_index,
-                                                        index.inds_ptrs().data_handle(),
-                                                        index.data_ptrs().data_handle(),
-                                                        index.list_sizes().data_handle(),
-                                                        n_probes,
-                                                        k,
-                                                        index.dim(),
-                                                        neighbors,
-                                                        distances);
-    queries += grid_dim_y * index.dim();
-    neighbors += grid_dim_y * grid_dim_x * k;
-    distances += grid_dim_y * grid_dim_x * k;
-  }
-}
-
-template <int Veclen, typename T, typename AccT>
-struct euclidean_dist {
-  __device__ __forceinline__ void operator()(AccT& acc, AccT x, AccT y)
-  {
-    const auto diff = x - y;
-    acc += diff * diff;
-  }
-};
-
-template <int Veclen>
-struct euclidean_dist<Veclen, uint8_t, uint32_t> {
-  __device__ __forceinline__ void operator()(uint32_t& acc, uint32_t x, uint32_t y)
-  {
-    if constexpr (Veclen > 1) {
-      const auto diff = __vabsdiffu4(x, y);
-      acc             = dp4a(diff, diff, acc);
-    } else {
-      const auto diff = __usad(x, y, 0u);
-      acc += diff * diff;
-    }
-  }
-};
-
-template <int Veclen>
-struct euclidean_dist<Veclen, int8_t, int32_t> {
-  __device__ __forceinline__ void operator()(int32_t& acc, int32_t x, int32_t y)
-  {
-    if constexpr (Veclen > 1) {
-      // Note that we enforce here that the unsigned version of dp4a is used, because the difference
-      // between two int8 numbers can be greater than 127 and therefore represented as a negative
-      // number in int8. Casting from int8 to int32 would yield incorrect results, while casting
-      // from uint8 to uint32 is correct.
-      const auto diff = __vabsdiffs4(x, y);
-      acc             = dp4a(diff, diff, static_cast<uint32_t>(acc));
-    } else {
-      const auto diff = x - y;
-      acc += diff * diff;
-    }
-  }
-};
-
-template <int Veclen, typename T, typename AccT>
-struct inner_prod_dist {
-  __device__ __forceinline__ void operator()(AccT& acc, AccT x, AccT y)
-  {
-    if constexpr (Veclen > 1 && (std::is_same_v<T, int8_t> || std::is_same_v<T, uint8_t>)) {
-      acc = dp4a(x, y, acc);
-    } else {
-      acc += x * y;
-    }
-  }
-};
-
-/** Select the distance computation function and forward the rest of the arguments. */
-template <int Capacity,
-          int Veclen,
-          bool Ascending,
-          typename T,
-          typename AccT,
-          typename IdxT,
-          typename... Args>
-void launch_with_fixed_consts(raft::distance::DistanceType metric, Args&&... args)
-{
-  switch (metric) {
-    case raft::distance::DistanceType::L2Expanded:
-    case raft::distance::DistanceType::L2Unexpanded:
-      return launch_kernel<Capacity,
-                           Veclen,
-                           Ascending,
-                           T,
-                           AccT,
-                           IdxT,
-                           euclidean_dist<Veclen, T, AccT>,
-                           raft::identity_op>({}, {}, std::forward<Args>(args)...);
-    case raft::distance::DistanceType::L2SqrtExpanded:
-    case raft::distance::DistanceType::L2SqrtUnexpanded:
-      return launch_kernel<Capacity,
-                           Veclen,
-                           Ascending,
-                           T,
-                           AccT,
-                           IdxT,
-                           euclidean_dist<Veclen, T, AccT>,
-                           raft::sqrt_op>({}, {}, std::forward<Args>(args)...);
-    case raft::distance::DistanceType::InnerProduct:
-      return launch_kernel<Capacity,
-                           Veclen,
-                           Ascending,
-                           T,
-                           AccT,
-                           IdxT,
-                           inner_prod_dist<Veclen, T, AccT>,
-                           raft::identity_op>({}, {}, std::forward<Args>(args)...);
-    // NB: update the description of `knn::ivf_flat::build` when adding here a new metric.
-    default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric));
-  }
-}
-
-/**
- * Lift the `capacity` and `veclen` parameters to the template level,
- * forward the rest of the arguments unmodified to `launch_interleaved_scan_kernel`.
- */
-template <typename T,
-          typename AccT,
-          typename IdxT,
-          int Capacity = matrix::detail::select::warpsort::kMaxCapacity,
-          int Veclen   = std::max<int>(1, 16 / sizeof(T))>
-struct select_interleaved_scan_kernel {
-  /**
-   * Recursively reduce the `Capacity` and `Veclen` parameters until they match the
-   * corresponding runtime arguments.
-   * By default, this recursive process starts with maximum possible values of the
-   * two parameters and ends with both values equal to 1.
-   */
-  template <typename... Args>
-  static inline void run(int capacity, int veclen, bool select_min, Args&&... args)
-  {
-    if constexpr (Capacity > 1) {
-      if (capacity * 2 <= Capacity) {
-        return select_interleaved_scan_kernel<T, AccT, IdxT, Capacity / 2, Veclen>::run(
-          capacity, veclen, select_min, std::forward<Args>(args)...);
-      }
-    }
-    if constexpr (Veclen > 1) {
-      if (veclen * 2 <= Veclen) {
-        return select_interleaved_scan_kernel<T, AccT, IdxT, Capacity, Veclen / 2>::run(
-          capacity, veclen, select_min, std::forward<Args>(args)...);
-      }
-    }
-    // NB: this is the limitation of the warpsort structures that use a huge number of
-    //     registers (used in the main kernel here).
-    RAFT_EXPECTS(capacity == Capacity,
-                 "Capacity must be power-of-two not bigger than the maximum allowed size "
-                 "matrix::detail::select::warpsort::kMaxCapacity (%d).",
-                 matrix::detail::select::warpsort::kMaxCapacity);
-    RAFT_EXPECTS(
-      veclen == Veclen,
-      "Veclen must be power-of-two not bigger than the maximum allowed size for this data type.");
-    if (select_min) {
-      launch_with_fixed_consts<Capacity, Veclen, true, T, AccT, IdxT>(std::forward<Args>(args)...);
-    } else {
-      launch_with_fixed_consts<Capacity, Veclen, false, T, AccT, IdxT>(std::forward<Args>(args)...);
-    }
-  }
-};
-
-/**
- * @brief Configure and launch an appropriate template instance of the interleaved scan kernel.
- *
- * @tparam T value type
- * @tparam AccT accumulated type
- * @tparam IdxT type of the indices
- *
- * @param index previously built ivf-flat index
- * @param[in] queries device pointer to the query vectors [batch_size, dim]
- * @param[in] coarse_query_results device pointer to the cluster (list) ids [batch_size, n_probes]
- * @param n_queries batch size
- * @param metric type of the measured distance
- * @param n_probes number of nearest clusters to query
- * @param k number of nearest neighbors.
- *            NB: the maximum value of `k` is limited statically by `kMaxCapacity`.
- * @param select_min whether to select nearest (true) or furthest (false) points w.r.t. the given
- * metric.
- * @param[out] neighbors device pointer to the result indices for each query and cluster
- * [batch_size, grid_dim_x, k]
- * @param[out] distances device pointer to the result distances for each query and cluster
- * [batch_size, grid_dim_x, k]
- * @param[inout] grid_dim_x number of blocks launched across all n_probes clusters;
- *               (one block processes one or more probes, hence: 1 <= grid_dim_x <= n_probes)
- * @param stream
- */
-template <typename T, typename AccT, typename IdxT>
-void ivfflat_interleaved_scan(const index<T, IdxT>& index,
-                              const T* queries,
-                              const uint32_t* coarse_query_results,
-                              const uint32_t n_queries,
-                              const raft::distance::DistanceType metric,
-                              const uint32_t n_probes,
-                              const uint32_t k,
-                              const bool select_min,
-                              IdxT* neighbors,
-                              float* distances,
-                              uint32_t& grid_dim_x,
-                              rmm::cuda_stream_view stream)
-{
-  // greppable-id-specializations-ivf-flat-search: The ivfflat_interleaved_scan
-  // function is used in both raft::neighbors::ivf_flat::search and
-  // raft::neighbors::detail::refine_device. To prevent a duplicate
-  // instantiation of this function (which defines ~270 kernels) in the refine
-  // specializations, an extern template definition is provided. Please check
-  // related function calls after editing this function definition. Search for
-  // `greppable-id-specializations-ivf-flat-search` to find them.
-
-  const int capacity = bound_by_power_of_two(k);
-  select_interleaved_scan_kernel<T, AccT, IdxT>::run(capacity,
-                                                     index.veclen(),
-                                                     select_min,
-                                                     metric,
-                                                     index,
-                                                     queries,
-                                                     coarse_query_results,
-                                                     n_queries,
-                                                     n_probes,
-                                                     k,
-                                                     neighbors,
-                                                     distances,
-                                                     grid_dim_x,
-                                                     stream);
-}
-
 template <typename T, typename AccT, typename IdxT>
 void search_impl(raft::device_resources const& handle,
-                 const index<T, IdxT>& index,
+                 const raft::neighbors::ivf_flat::index<T, IdxT>& index,
                  const T* queries,
                  uint32_t n_queries,
                  uint32_t k,
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh
index e69de29bb2..1f262e4463 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "ivf_flat_search-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "ivf_flat_search-ext.cuh"
+#endif
diff --git a/cpp/include/raft/neighbors/detail/refine.cuh b/cpp/include/raft/neighbors/detail/refine.cuh
index e3a2c7d109..20f86d9bae 100644
--- a/cpp/include/raft/neighbors/detail/refine.cuh
+++ b/cpp/include/raft/neighbors/detail/refine.cuh
@@ -20,7 +20,9 @@
 #include <raft/core/device_resources.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/nvtx.hpp>
+#include <raft/matrix/detail/select_warpsort.cuh>
 #include <raft/neighbors/detail/ivf_flat_build.cuh>
+#include <raft/neighbors/detail/ivf_flat_interleaved_scan.cuh>
 #include <raft/neighbors/detail/ivf_flat_search.cuh>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 
diff --git a/cpp/include/raft/neighbors/detail/selection_faiss.cuh b/cpp/include/raft/neighbors/detail/selection_faiss.cuh
index ccdba994d1..d1a2ac1a17 100644
--- a/cpp/include/raft/neighbors/detail/selection_faiss.cuh
+++ b/cpp/include/raft/neighbors/detail/selection_faiss.cuh
@@ -16,10 +16,10 @@
 
 #pragma once
 
-#if defined(RAFT_COMPILED)
-#include "selection_faiss-ext.cuh"
-#endif
-
 #if !defined(RAFT_EXPLICIT_INSTANTIATE)
 #include "selection_faiss-inl.cuh"
 #endif
+
+#if defined(RAFT_COMPILED)
+#include "selection_faiss-ext.cuh"
+#endif
diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp
index 2a6aa12847..39269738dc 100644
--- a/cpp/include/raft/neighbors/ivf_flat_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp
@@ -27,6 +27,7 @@
 #include <raft/neighbors/ivf_list_types.hpp>
 #include <raft/util/integer_utils.hpp>
 
+#include <algorithm>  // std::max
 #include <memory>
 #include <optional>
 #include <thrust/fill.h>
@@ -379,7 +380,7 @@ struct index : ann::index {
   {
     // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a
     // template parameter (https://github.com/rapidsai/raft/issues/711)
-    uint32_t veclen = 16 / sizeof(T);
+    uint32_t veclen = std::max<uint32_t>(1, 16 / sizeof(T));
     while (dim % veclen != 0) {
       veclen = veclen >> 1;
     }
diff --git a/cpp/include/raft/neighbors/specializations/ivf_flat.cuh b/cpp/include/raft/neighbors/specializations/ivf_flat.cuh
index 161f3462c9..0f17bd8586 100644
--- a/cpp/include/raft/neighbors/specializations/ivf_flat.cuh
+++ b/cpp/include/raft/neighbors/specializations/ivf_flat.cuh
@@ -44,30 +44,7 @@ namespace raft::neighbors::ivf_flat {
     raft::device_resources const& handle,                                                \
     raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                      \
     std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices,               \
-    raft::neighbors::ivf_flat::index<T, IdxT>* idx);                                     \
-                                                                                         \
-  extern template void search(raft::device_resources const&,                             \
-                              raft::neighbors::ivf_flat::search_params const&,           \
-                              const raft::neighbors::ivf_flat::index<T, IdxT>&,          \
-                              raft::device_matrix_view<const T, IdxT, row_major>,        \
-                              raft::device_matrix_view<IdxT, IdxT, row_major>,           \
-                              raft::device_matrix_view<float, IdxT, row_major>);         \
-                                                                                         \
-  extern template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan<      \
-    T,                                                                                   \
-    typename raft::spatial::knn::detail::utils::config<T>::value_t,                      \
-    IdxT>(const index<T, IdxT>& index,                                                   \
-          const T* queries,                                                              \
-          const uint32_t* coarse_query_results,                                          \
-          const uint32_t n_queries,                                                      \
-          const raft::distance::DistanceType metric,                                     \
-          const uint32_t n_probes,                                                       \
-          const uint32_t k,                                                              \
-          const bool select_min,                                                         \
-          IdxT* neighbors,                                                               \
-          float* distances,                                                              \
-          uint32_t& grid_dim_x,                                                          \
-          rmm::cuda_stream_view stream);
+    raft::neighbors::ivf_flat::index<T, IdxT>* idx);
 
 RAFT_INST(float, int64_t);
 RAFT_INST(int8_t, int64_t);
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 395714a161..d8fe216a85 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <raft/core/logger.hpp>
-#include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
diff --git a/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu
new file mode 100644
index 0000000000..4dfa2a707c
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh>
+
+#define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(T, AccT, IdxT)  \
+  template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan<T, AccT, IdxT>( \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,                                 \
+    const T* queries,                                                                       \
+    const uint32_t* coarse_query_results,                                                   \
+    const uint32_t n_queries,                                                               \
+    const raft::distance::DistanceType metric,                                              \
+    const uint32_t n_probes,                                                                \
+    const uint32_t k,                                                                       \
+    const bool select_min,                                                                  \
+    IdxT* neighbors,                                                                        \
+    float* distances,                                                                       \
+    uint32_t& grid_dim_x,                                                                   \
+    rmm::cuda_stream_view stream)
+
+instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(float, float, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan
diff --git a/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu
new file mode 100644
index 0000000000..2d54248e4d
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh>
+
+#define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(T, AccT, IdxT)  \
+  template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan<T, AccT, IdxT>( \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,                                 \
+    const T* queries,                                                                       \
+    const uint32_t* coarse_query_results,                                                   \
+    const uint32_t n_queries,                                                               \
+    const raft::distance::DistanceType metric,                                              \
+    const uint32_t n_probes,                                                                \
+    const uint32_t k,                                                                       \
+    const bool select_min,                                                                  \
+    IdxT* neighbors,                                                                        \
+    float* distances,                                                                       \
+    uint32_t& grid_dim_x,                                                                   \
+    rmm::cuda_stream_view stream)
+
+instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(int8_t, int32_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan
diff --git a/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu
new file mode 100644
index 0000000000..75fe52f3c7
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh>
+
+#define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(T, AccT, IdxT)  \
+  template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan<T, AccT, IdxT>( \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,                                 \
+    const T* queries,                                                                       \
+    const uint32_t* coarse_query_results,                                                   \
+    const uint32_t n_queries,                                                               \
+    const raft::distance::DistanceType metric,                                              \
+    const uint32_t n_probes,                                                                \
+    const uint32_t k,                                                                       \
+    const bool select_min,                                                                  \
+    IdxT* neighbors,                                                                        \
+    float* distances,                                                                       \
+    uint32_t& grid_dim_x,                                                                   \
+    rmm::cuda_stream_view stream)
+
+instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(uint8_t, uint32_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan
diff --git a/cpp/src/neighbors/detail/ivf_flat_search.cu b/cpp/src/neighbors/detail/ivf_flat_search.cu
new file mode 100644
index 0000000000..345a8f499d
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_flat_search.cu
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/detail/ivf_flat_search-inl.cuh>
+
+#define instantiate_raft_neighbors_ivf_flat_detail_search(T, IdxT)  \
+  template void raft::neighbors::ivf_flat::detail::search<T, IdxT>( \
+    raft::device_resources const& handle,                           \
+    const search_params& params,                                    \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,         \
+    const T* queries,                                               \
+    uint32_t n_queries,                                             \
+    uint32_t k,                                                     \
+    IdxT* neighbors,                                                \
+    float* distances,                                               \
+    rmm::mr::device_memory_resource* mr)
+
+instantiate_raft_neighbors_ivf_flat_detail_search(float, int64_t);
+instantiate_raft_neighbors_ivf_flat_detail_search(int8_t, int64_t);
+instantiate_raft_neighbors_ivf_flat_detail_search(uint8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_detail_search

From 16a828f4829dbc0a7ab86cfd3412b0ff8333cc1a Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 5 Apr 2023 20:31:12 -0500
Subject: [PATCH 10/89] Pin `dask` and `distributed` for release (#1399)

This PR pins `dask` and `distributed` to `2023.3.2` and `2023.3.2.1` respectively for `23.04` release.

xref: https://github.com/rapidsai/cudf/pull/13070

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)
  - Ray Douglass (https://github.com/raydouglass)
  - Ben Frederickson (https://github.com/benfred)
  - Joseph (https://github.com/jolorunyomi)

URL: https://github.com/rapidsai/raft/pull/1399
---
 .github/workflows/pr.yaml                        | 4 ++--
 .github/workflows/test.yaml                      | 4 ++--
 conda/environments/all_cuda-118_arch-x86_64.yaml | 5 +++--
 conda/recipes/raft-dask/meta.yaml                | 5 +++--
 dependencies.yaml                                | 5 +++--
 python/raft-dask/pyproject.toml                  | 4 ++--
 6 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 9b81ab7d82..97554b380e 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -105,7 +105,7 @@ jobs:
       build_type: pull-request
       package-name: raft_dask
       # Always want to test against latest dask/distributed.
-      test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
-      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
+      test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
+      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
       test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"
       test-smoketest: "python ./ci/wheel_smoke_test_raft_dask.py"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 876c9a6722..d204d2c16e 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -51,6 +51,6 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       package-name: raft_dask
-      test-before-amd64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
-      test-before-arm64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
+      test-before-amd64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
+      test-before-arm64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
       test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 1afebc98e6..fd2d1d2280 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -18,9 +18,10 @@ dependencies:
 - cupy
 - cxx-compiler
 - cython>=0.29,<0.30
+- dask-core==2023.3.2
 - dask-cuda==23.4.*
-- dask>=2023.1.1
-- distributed>=2023.1.1
+- dask==2023.3.2
+- distributed==2023.3.2.1
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - graphviz
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index 59a67fe148..cd08deabfa 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -46,9 +46,10 @@ requirements:
   run:
     - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
     - cuda-python >=11.7.1,<12.0
-    - dask >=2023.1.1
+    - dask ==2023.3.2
+    - dask-core ==2023.3.2
     - dask-cuda ={{ minor_version }}
-    - distributed >=2023.1.1
+    - distributed ==2023.3.2.1
     - joblib >=0.11
     - nccl >=2.9.9
     - pylibraft {{ version }}
diff --git a/dependencies.yaml b/dependencies.yaml
index 64fd7cd454..af29bf68a8 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -265,15 +265,16 @@ dependencies:
     common:
       - output_types: [conda, pyproject]
         packages:
-          - dask>=2023.1.1
+          - dask==2023.3.2
           - dask-cuda==23.4.*
-          - distributed>=2023.1.1
+          - distributed==2023.3.2.1
           - joblib>=0.11
           - numba>=0.49
           - *numpy
           - ucx-py==0.31.*
       - output_types: conda
         packages:
+          - dask-core==2023.3.2
           - ucx>=1.13.0
           - ucx-proc=*=gpu
       - output_types: pyproject
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index fe490ea117..ba6cd7ccae 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -35,8 +35,8 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.8"
 dependencies = [
     "dask-cuda==23.4.*",
-    "dask>=2023.1.1",
-    "distributed>=2023.1.1",
+    "dask==2023.3.2",
+    "distributed==2023.3.2.1",
     "joblib>=0.11",
     "numba>=0.49",
     "numpy>=1.21",

From 66b8493cd3b228838ea8d50474e824f718286c1a Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Thu, 6 Apr 2023 10:13:57 +0200
Subject: [PATCH 11/89] CAGRA (#1375)

This PR adds CAGRA, a graph based method for nearest neighbor search.

Authors:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)
  - Ben Frederickson (https://github.com/benfred)

Approvers:
  - Ben Frederickson (https://github.com/benfred)
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/raft/pull/1375
---
 cpp/include/raft/neighbors/cagra.cuh          |  226 ++++
 .../raft/neighbors/cagra_serialize.cuh        |  154 +++
 cpp/include/raft/neighbors/cagra_types.hpp    |  199 +++
 .../raft/neighbors/detail/cagra/bitonic.hpp   |  226 ++++
 .../neighbors/detail/cagra/cagra_build.cuh    |  236 ++++
 .../neighbors/detail/cagra/cagra_search.cuh   |  100 ++
 .../detail/cagra/cagra_serialize.cuh          |  123 ++
 .../detail/cagra/compute_distance.hpp         |  253 ++++
 .../neighbors/detail/cagra/device_common.hpp  |   76 ++
 .../raft/neighbors/detail/cagra/factory.cuh   |   90 ++
 .../raft/neighbors/detail/cagra/fragment.hpp  |  212 +++
 .../neighbors/detail/cagra/graph_core.cuh     |  809 ++++++++++++
 .../raft/neighbors/detail/cagra/hashmap.hpp   |   86 ++
 .../detail/cagra/search_multi_cta.cuh         |  632 +++++++++
 .../detail/cagra/search_multi_kernel.cuh      |  721 ++++++++++
 .../neighbors/detail/cagra/search_plan.cuh    |  334 +++++
 .../detail/cagra/search_single_cta.cuh        | 1157 +++++++++++++++++
 .../detail/cagra/topk_for_cagra/topk.h        |   57 +
 .../detail/cagra/topk_for_cagra/topk_core.cuh |  926 +++++++++++++
 .../raft/neighbors/detail/cagra/utils.hpp     |  143 ++
 cpp/include/raft/util/cache_util.cuh          |    4 +-
 cpp/test/CMakeLists.txt                       |    1 +
 cpp/test/neighbors/ann_cagra.cuh              |  313 +++++
 .../ann_cagra/test_float_uint32_t.cu          |   32 +
 cpp/test/neighbors/ann_utils.cuh              |   49 +
 25 files changed, 7157 insertions(+), 2 deletions(-)
 create mode 100644 cpp/include/raft/neighbors/cagra.cuh
 create mode 100644 cpp/include/raft/neighbors/cagra_serialize.cuh
 create mode 100644 cpp/include/raft/neighbors/cagra_types.hpp
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/bitonic.hpp
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/device_common.hpp
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/factory.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/fragment.hpp
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/hashmap.hpp
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk.h
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/cagra/utils.hpp
 create mode 100644 cpp/test/neighbors/ann_cagra.cuh
 create mode 100644 cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu

diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh
new file mode 100644
index 0000000000..90728efd70
--- /dev/null
+++ b/cpp/include/raft/neighbors/cagra.cuh
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/cagra/cagra_build.cuh"
+#include "detail/cagra/cagra_search.cuh"
+#include "detail/cagra/graph_core.cuh"
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_device_accessor.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/neighbors/cagra_types.hpp>
+#include <rmm/cuda_stream_view.hpp>
+
+namespace raft::neighbors::experimental::cagra {
+
+/**
+ * @defgroup cagra CUDA ANN Graph-based nearest neighbor search
+ * @{
+ */
+
+/**
+ * @brief Build a kNN graph.
+ *
+ * The kNN graph is the first building block for CAGRA index.
+ * This function uses the IVF-PQ method to build a kNN graph.
+ *
+ * The output is a dense matrix that stores the neighbor indices for each point in the dataset.
+ * Each point has the same number of neighbors.
+ *
+ * See [cagra::build](#cagra::build) for an alternative method.
+ *
+ * The following distance metrics are supported:
+ * - L2Expanded
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   // use default index parameters
+ *   ivf_pq::index_params build_params;
+ *   ivf_pq::search_params search_params;
+ *   auto knn_graph      = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
+ *   // create knn graph
+ *   cagra::build_knn_graph(res, dataset, knn_graph.view(), 2, build_params, search_params);
+ *   auto pruned_graph     = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 64);
+ *   cagra::prune(res, dataset, knn_graph.view(), pruned_graph.view());
+ *   // Construct an index from dataset and pruned knn_graph
+ *   auto index = cagra::index<T, IdxT>(res, build_params.metric, dataset, pruned_graph.view());
+ * @endcode
+ *
+ * @tparam DataT data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res raft resources
+ * @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
+ * @param[out] knn_graph a host matrix view to store the output knn graph [n_rows, graph_degree]
+ * @param[in] refine_rate refinement rate for ivf-pq search
+ * @param[in] build_params (optional) ivf_pq index building parameters for knn graph
+ * @param[in] search_params (optional) ivf_pq search parameters
+ */
+template <typename DataT, typename IdxT, typename accessor>
+void build_knn_graph(raft::device_resources const& res,
+                     mdspan<const DataT, matrix_extent<IdxT>, row_major, accessor> dataset,
+                     raft::host_matrix_view<IdxT, IdxT, row_major> knn_graph,
+                     std::optional<float> refine_rate                   = std::nullopt,
+                     std::optional<ivf_pq::index_params> build_params   = std::nullopt,
+                     std::optional<ivf_pq::search_params> search_params = std::nullopt)
+{
+  detail::build_knn_graph(res, dataset, knn_graph, refine_rate, build_params, search_params);
+}
+
+/**
+ * @brief Prune a KNN graph.
+ *
+ * Decrease the number of neighbors for each node.
+ *
+ * See [cagra::build_knn_graph](#cagra::build_knn_graph) for usage example
+ *
+ * @tparam DATA_T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res raft resources
+ * @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
+ * @param[in] knn_graph a matrix view (host or device) of the input knn graph [n_rows,
+ * knn_graph_degree]
+ * @param[out] new_graph a host matrix view of the pruned knn graph [n_rows, graph_degree]
+ */
+template <class DATA_T,
+          typename IdxT = uint32_t,
+          typename d_accessor =
+            host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::device>,
+          typename g_accessor =
+            host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::host>>
+void prune(raft::device_resources const& res,
+           mdspan<const DATA_T, matrix_extent<IdxT>, row_major, d_accessor> dataset,
+           mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
+           raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
+{
+  detail::graph::prune(res, dataset, knn_graph, new_graph);
+}
+
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * The build consists of two steps: build an intermediate knn-graph, and prune it to
+ * create the final graph. The index_params struct controls the node degree of these
+ * graphs.
+ *
+ * It is required that dataset and the pruned graph fit the GPU memory.
+ *
+ * To customize the parameters for knn-graph building and pruning, and to reuse the
+ * intermediate results, you could build the index in two steps using
+ * [cagra::build_knn_graph](#cagra::build_knn_graph) and [cagra::prune](#cagra::prune).
+ *
+ * The following distance metrics are supported:
+ * - L2
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors::experimental;
+ *   // use default index parameters
+ *   cagra::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = cagra::build(res, index_params, dataset);
+ *   // use default search parameters
+ *   cagra::search_params search_params;
+ *   // search K nearest neighbours
+ *   auto neighbors = raft::make_device_matrix<uint32_t>(res, n_queries, k);
+ *   auto distances = raft::make_device_matrix<float>(res, n_queries, k);
+ *   cagra::search(res, search_params, index, queries, neighbors.view(), distances.view());
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res raft resources
+ * @param[in] params parameters for building the index
+ * @param[in] dataset a matrix view (host or device) to a row-major matrix [n_rows, dim]
+ *
+ * @return the constructed cagra index
+ */
+template <typename T,
+          typename IdxT = uint32_t,
+          typename Accessor =
+            host_device_accessor<std::experimental::default_accessor<T>, memory_type::host>>
+index<T, IdxT> build(raft::device_resources const& res,
+                     const index_params& params,
+                     mdspan<const T, matrix_extent<IdxT>, row_major, Accessor> dataset)
+{
+  size_t degree = params.intermediate_graph_degree;
+  if (degree >= dataset.extent(0)) {
+    RAFT_LOG_WARN(
+      "Intermediate graph degree cannot be larger than dataset size, reducing it to %lu",
+      dataset.extent(0));
+    degree = dataset.extent(0) - 1;
+  }
+  RAFT_EXPECTS(degree >= params.graph_degree,
+               "Intermediate graph degree cannot be smaller than final graph degree");
+
+  auto knn_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), degree);
+
+  build_knn_graph(res, dataset, knn_graph.view());
+
+  auto cagra_graph = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), params.graph_degree);
+
+  prune<T, IdxT>(res, dataset, knn_graph.view(), cagra_graph.view());
+
+  // Construct an index from dataset and pruned knn graph.
+  return index<T, IdxT>(res, params.metric, dataset, cagra_graph.view());
+}
+
+/**
+ * @brief Search ANN using the constructed index.
+ *
+ * See the [cagra::build](#cagra::build) documentation for a usage example.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] res raft resources
+ * @param[in] params configure the search
+ * @param[in] idx cagra index
+ * @param[in] queries a device matrix view to a row-major matrix [n_queries, idx.dim()]
+ * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset
+ * [n_queries, k]
+ * @param[out] distances a device matrix view to the distances to the selected neighbors [n_queries,
+ * k]
+ */
+template <typename T, typename IdxT>
+void search(raft::device_resources const& res,
+            const search_params& params,
+            const index<T, IdxT>& idx,
+            raft::device_matrix_view<const T, IdxT, row_major> queries,
+            raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,
+            raft::device_matrix_view<float, IdxT, row_major> distances)
+{
+  RAFT_EXPECTS(
+    queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
+    "Number of rows in output neighbors and distances matrices must equal the number of queries.");
+
+  RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1),
+               "Number of columns in output neighbors and distances matrices must equal k");
+
+  RAFT_EXPECTS(queries.extent(1) == idx.dim(),
+               "Number of query dimensions should equal number of dimensions in the index.");
+
+  detail::search_main(res, params, idx, queries, neighbors, distances);
+}
+/** @} */  // end group cagra
+
+}  // namespace raft::neighbors::experimental::cagra
diff --git a/cpp/include/raft/neighbors/cagra_serialize.cuh b/cpp/include/raft/neighbors/cagra_serialize.cuh
new file mode 100644
index 0000000000..befd5e9c07
--- /dev/null
+++ b/cpp/include/raft/neighbors/cagra_serialize.cuh
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/cagra/cagra_serialize.cuh"
+
+namespace raft::neighbors::experimental::cagra {
+
+/**
+ * \defgroup cagra_serialize CAGRA Serialize
+ * @{
+ */
+
+/**
+ * Write the index to an output stream
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/device_resources.hpp>
+ *
+ * raft::device_resources handle;
+ *
+ * // create an output stream
+ * std::ostream os(std::cout.rdbuf());
+ * // create an index with `auto index = cagra::build(...);`
+ * raft::neighbors::experimental::cagra::serialize(handle, os, index);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] handle the raft handle
+ * @param[in] os output stream
+ * @param[in] index CAGRA index
+ *
+ */
+template <typename T, typename IdxT>
+void serialize(raft::device_resources const& handle, std::ostream& os, const index<T, IdxT>& index)
+{
+  detail::serialize(handle, os, index);
+}
+
+/**
+ * Save the index to file.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/device_resources.hpp>
+ *
+ * raft::device_resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * // create an index with `auto index = cagra::build(...);`
+ * raft::neighbors::experimental::cagra::serialize(handle, filename, index);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the file name for saving the index
+ * @param[in] index CAGRA index
+ *
+ */
+template <typename T, typename IdxT>
+void serialize(raft::device_resources const& handle,
+               const std::string& filename,
+               const index<T, IdxT>& index)
+{
+  detail::serialize(handle, filename, index);
+}
+
+/**
+ * Load index from input stream
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/device_resources.hpp>
+ *
+ * raft::device_resources handle;
+ *
+ * // create an input stream
+ * std::istream is(std::cin.rdbuf());
+ * using T    = float; // data element type
+ * using IdxT = int; // type of the index
+ * auto index = raft::neighbors::experimental::cagra::deserialize<T, IdxT>(handle, is);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] handle the raft handle
+ * @param[in] is input stream
+ *
+ * @return raft::neighbors::experimental::cagra::index<T, IdxT>
+ */
+template <typename T, typename IdxT>
+index<T, IdxT> deserialize(raft::device_resources const& handle, std::istream& is)
+{
+  return detail::deserialize<T, IdxT>(handle, is);
+}
+
+/**
+ * Load index from file.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @code{.cpp}
+ * #include <raft/core/device_resources.hpp>
+ *
+ * raft::device_resources handle;
+ *
+ * // create a string with a filepath
+ * std::string filename("/path/to/index");
+ * using T    = float; // data element type
+ * using IdxT = int; // type of the index
+ * auto index = raft::neighbors::experimental::cagra::deserialize<T, IdxT>(handle, filename);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] handle the raft handle
+ * @param[in] filename the name of the file that stores the index
+ *
+ * @return raft::neighbors::experimental::cagra::index<T, IdxT>
+ */
+template <typename T, typename IdxT>
+index<T, IdxT> deserialize(raft::device_resources const& handle, const std::string& filename)
+{
+  return detail::deserialize<T, IdxT>(handle, filename);
+}
+
+/**@}*/
+
+}  // namespace raft::neighbors::experimental::cagra
diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp
new file mode 100644
index 0000000000..bd9b3b586b
--- /dev/null
+++ b/cpp/include/raft/neighbors/cagra_types.hpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "ann_types.hpp"
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/error.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/mdspan_types.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/util/integer_utils.hpp>
+
+#include <memory>
+#include <optional>
+#include <string>
+#include <thrust/fill.h>
+#include <type_traits>
+
+namespace raft::neighbors::experimental::cagra {
+/**
+ * @ingroup cagra
+ * @{
+ */
+
+struct index_params : ann::index_params {
+  size_t intermediate_graph_degree = 128;  /**< Degree of input graph for pruning. */
+  size_t graph_degree              = 64;   /**< Degree of output graph. */
+};
+
+enum class search_algo {
+  SINGLE_CTA,  /**< for large batch */
+  MULTI_CTA,   /**< for small batch */
+  MULTI_KERNEL,  // NOTE(review): intended use case is not documented here
+  AUTO  // default of search_params::algo; presumably the library picks one of the above - confirm
+};
+
+enum class hash_mode { HASH, SMALL, AUTO };  // choices for search_params::hashmap_mode
+
+struct search_params : ann::search_params {
+  /** Maximum number of queries to search at the same time (batch size). */
+  size_t max_queries = 1;
+
+  /** Number of intermediate search results retained during the search.
+   *
+   *  This is the main knob to adjust trade off between accuracy and search speed.
+   *  Higher values improve the search accuracy.
+   */
+  size_t itopk_size = 64;
+
+  /** Upper limit of search iterations. Auto select when 0.*/
+  size_t max_iterations = 0;
+
+  // In the following we list additional search parameters for fine tuning.
+  // Reasonable default values are automatically chosen.
+
+  /** Which search implementation to use. */
+  search_algo algo = search_algo::AUTO;
+
+  /** Number of threads used to calculate a single distance. 4, 8, 16, or 32. */
+  size_t team_size = 0;
+
+  /** Number of graph nodes to select as the starting point for the search in each iteration
+   * (also referred to as the search width). */
+  size_t num_parents = 1;
+  /** Lower limit of search iterations. */
+  size_t min_iterations = 0;
+
+  /** Bit length for reading the dataset vectors. 0, 64 or 128. Auto selection when 0. */
+  size_t load_bit_length = 0;
+  /** Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0. */
+  size_t thread_block_size = 0;
+  /** Hashmap type. Auto selection when AUTO. */
+  hash_mode hashmap_mode = hash_mode::AUTO;
+  /** Lower limit of hashmap bit length. More than 8. */
+  size_t hashmap_min_bitlen = 0;
+  /** Upper limit of hashmap fill rate. More than 0.1, less than 0.9.*/
+  float hashmap_max_fill_rate = 0.5;
+
+  /** Number of iterations of initial random seed node selection. 1 or more. */
+  uint32_t num_random_samplings = 1;
+  /** Bit mask used for initial random seed node selection. */
+  uint64_t rand_xor_mask = 0x128394;
+};
+
+static_assert(std::is_aggregate_v<index_params>);   // parameter structs must remain aggregates
+static_assert(std::is_aggregate_v<search_params>);  // (no user-provided constructors)
+
+/**
+ * @brief CAGRA index.
+ *
+ * The index stores the dataset and a kNN graph in device memory.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ */
+template <typename T, typename IdxT>
+struct index : ann::index {
+  static_assert(!raft::is_narrowing_v<uint32_t, IdxT>,
+                "IdxT must be able to represent all values of uint32_t");
+
+ public:
+  /** Distance metric the index was constructed with. */
+  [[nodiscard]] constexpr inline auto metric() const noexcept -> raft::distance::DistanceType
+  {
+    return metric_;
+  }
+
+  /** Total length of the index, i.e. the number of rows of the stored dataset. */
+  [[nodiscard]] constexpr inline auto size() const noexcept -> IdxT { return dataset_.extent(0); }
+
+  /** Dimensionality of the data. */
+  [[nodiscard]] constexpr inline auto dim() const noexcept -> uint32_t
+  {
+    return dataset_.extent(1);
+  }
+  /** Graph degree, i.e. the number of columns of the stored neighborhood graph. */
+  [[nodiscard]] constexpr inline auto graph_degree() const noexcept -> uint32_t
+  {
+    return graph_.extent(1);
+  }
+
+  /** Dataset [size, dim] */
+  [[nodiscard]] inline auto dataset() const noexcept -> device_matrix_view<const T, IdxT, row_major>
+  {
+    return dataset_.view();
+  }
+
+  /** neighborhood graph [size, graph-degree] (mutable view) */
+  inline auto graph() noexcept -> device_matrix_view<IdxT, IdxT, row_major>
+  {
+    return graph_.view();
+  }
+
+  [[nodiscard]] inline auto graph() const noexcept  // read-only view of the same graph
+    -> device_matrix_view<const IdxT, IdxT, row_major>
+  {
+    return graph_.view();
+  }
+
+  // Don't allow copying the index for performance reasons (try avoiding copying data)
+  index(const index&) = delete;
+  index(index&&)      = default;
+  auto operator=(const index&) -> index& = delete;
+  auto operator=(index&&) -> index& = default;
+  ~index()                          = default;
+
+  /** Construct an empty index: 0 x 0 dataset and graph, L2Expanded metric. */
+  index(raft::device_resources const& res)
+    : ann::index(),
+      metric_(raft::distance::DistanceType::L2Expanded),
+      dataset_(make_device_matrix<T, IdxT>(res, 0, 0)),
+      graph_(make_device_matrix<IdxT, IdxT>(res, 0, 0))
+  {
+  }
+
+  /** Construct an index from dataset and knn_graph arrays; both are copied into the index. */
+  template <typename data_accessor, typename graph_accessor>
+  index(raft::device_resources const& res,
+        raft::distance::DistanceType metric,
+        mdspan<const T, matrix_extent<IdxT>, row_major, data_accessor> dataset,
+        mdspan<IdxT, matrix_extent<IdxT>, row_major, graph_accessor> knn_graph)
+    : ann::index(),
+      metric_(metric),
+      dataset_(make_device_matrix<T, IdxT>(res, dataset.extent(0), dataset.extent(1))),
+      graph_(make_device_matrix<IdxT, IdxT>(res, knn_graph.extent(0), knn_graph.extent(1)))
+  {
+    RAFT_EXPECTS(dataset.extent(0) == knn_graph.extent(0),
+                 "Dataset and knn_graph must have equal number of rows");
+    raft::copy(dataset_.data_handle(), dataset.data_handle(), dataset.size(), res.get_stream());
+    raft::copy(graph_.data_handle(), knn_graph.data_handle(), knn_graph.size(), res.get_stream());
+    res.sync_stream();  // both copies were issued on res.get_stream(); wait before returning
+  }
+
+ private:
+  raft::distance::DistanceType metric_;
+  raft::device_matrix<T, IdxT, row_major> dataset_;
+  raft::device_matrix<IdxT, IdxT, row_major> graph_;
+};
+
+/** @} */
+
+}  // namespace raft::neighbors::experimental::cagra
diff --git a/cpp/include/raft/neighbors/detail/cagra/bitonic.hpp b/cpp/include/raft/neighbors/detail/cagra/bitonic.hpp
new file mode 100644
index 0000000000..45aff99421
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/bitonic.hpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstdint>
+#include <raft/core/detail/macros.hpp>
+
+namespace raft::neighbors::experimental::cagra::detail {
+namespace bitonic {
+
+namespace detail {
+
+template <class K, class V>
+_RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, K& k1, V& v1, const bool asc)
+{
+  // Leave the pair untouched when the keys tie or already follow the direction given by `asc`.
+  if (!(k0 != k1) || ((k0 < k1) == asc)) { return; }
+  const auto held_key = k0;
+  k0                  = k1;
+  k1                  = held_key;
+  const auto held_val = v0;
+  v0                  = v1;
+  v1                  = held_val;
+}
+
+template <class K, class V>
+_RAFT_DEVICE inline void swap_if_needed(K& k0, V& v0, const unsigned lane_offset, const bool asc)
+{
+  auto k1 = __shfl_xor_sync(~0u, k0, lane_offset);  // key read from lane (own lane ^ lane_offset)
+  auto v1 = __shfl_xor_sync(~0u, v0, lane_offset);  // value read from that same lane
+  if ((k0 != k1) && ((k0 < k1) != asc)) {  // same condition as the four-reference overload
+    k0 = k1;  // only this thread's own pair is overwritten here
+    v0 = v1;
+  }
+}
+
+template <class K, class V, unsigned N, unsigned warp_size = 32>
+struct warp_merge_core {
+  _RAFT_DEVICE inline void operator()(K k[N], V v[N], const std::uint32_t range, const bool asc)
+  {  // NOTE(review): `asc` is accepted but never read in this body
+    const auto lane_id = threadIdx.x % warp_size;  // threadIdx.x reduced modulo warp_size
+
+    if (range == 1) {  // range 1: exchanges stay among this thread's own N elements (no shuffles)
+      for (std::uint32_t b = 2; b <= N; b <<= 1) {
+        for (std::uint32_t c = b / 2; c >= 1; c >>= 1) {
+#pragma unroll
+          for (std::uint32_t i = 0; i < N; i++) {
+            std::uint32_t j = i ^ c;  // partner element at index distance c
+            if (i >= j) continue;  // visit each (i, j) pair once, from its lower index
+            const auto line_id = i + (N * lane_id);  // element position counted with N per lane
+            const auto p       = static_cast<bool>(line_id & b) == static_cast<bool>(line_id & c);
+            swap_if_needed(k[i], v[i], k[j], v[j], p);
+          }
+        }
+      }
+      return;
+    }
+
+    const std::uint32_t b = range;
+    for (std::uint32_t c = b / 2; c >= 1; c >>= 1) {  // cross-lane steps, lane distance c
+      const auto p = static_cast<bool>(lane_id & b) == static_cast<bool>(lane_id & c);
+#pragma unroll
+      for (std::uint32_t i = 0; i < N; i++) {
+        swap_if_needed(k[i], v[i], c, p);  // shuffle-based overload: partner is lane ^ c
+      }
+    }
+    const auto p = ((lane_id & b) == 0);  // direction flag for the in-thread steps below
+    for (std::uint32_t c = N / 2; c >= 1; c >>= 1) {  // in-thread steps, element distance c
+#pragma unroll
+      for (std::uint32_t i = 0; i < N; i++) {
+        std::uint32_t j = i ^ c;
+        if (i >= j) continue;
+        swap_if_needed(k[i], v[i], k[j], v[j], p);
+      }
+    }
+  }
+};
+
+template <class K, class V, unsigned warp_size>
+struct warp_merge_core<K, V, 6, warp_size> {
+  _RAFT_DEVICE inline void operator()(K k[6], V v[6], const std::uint32_t range, const bool asc)
+  {  // NOTE(review): `asc` is accepted but never read in this body
+    constexpr unsigned N = 6;
+    const auto lane_id   = threadIdx.x % warp_size;
+
+    if (range == 1) {  // range 1: in-thread exchanges only (no shuffles in this branch)
+      for (std::uint32_t i = 0; i < N; i += 3) {  // two triples: k[0..2] and k[3..5]
+        const auto p = (i == 0);  // true for the first triple, false for the second
+        swap_if_needed(k[0 + i], v[0 + i], k[1 + i], v[1 + i], p);
+        swap_if_needed(k[1 + i], v[1 + i], k[2 + i], v[2 + i], p);
+        swap_if_needed(k[0 + i], v[0 + i], k[1 + i], v[1 + i], p);
+      }
+      const auto p = ((lane_id & 1) == 0);  // true on even lanes
+      for (std::uint32_t i = 0; i < 3; i++) {  // pair element i with element i + 3
+        std::uint32_t j = i + 3;
+        swap_if_needed(k[i], v[i], k[j], v[j], p);
+      }
+      for (std::uint32_t i = 0; i < N; i += 3) {  // re-order each triple with the lane's flag
+        swap_if_needed(k[0 + i], v[0 + i], k[1 + i], v[1 + i], p);
+        swap_if_needed(k[1 + i], v[1 + i], k[2 + i], v[2 + i], p);
+        swap_if_needed(k[0 + i], v[0 + i], k[1 + i], v[1 + i], p);
+      }
+      return;
+    }
+
+    const std::uint32_t b = range;
+    for (std::uint32_t c = b / 2; c >= 1; c >>= 1) {  // cross-lane steps, lane distance c
+      const auto p = static_cast<bool>(lane_id & b) == static_cast<bool>(lane_id & c);
+#pragma unroll
+      for (std::uint32_t i = 0; i < N; i++) {
+        swap_if_needed(k[i], v[i], c, p);  // shuffle-based overload: partner is lane ^ c
+      }
+    }
+    const auto p = ((lane_id & b) == 0);  // direction flag for the in-thread steps below
+    for (std::uint32_t i = 0; i < 3; i++) {  // pair element i with element i + 3
+      std::uint32_t j = i + 3;
+      swap_if_needed(k[i], v[i], k[j], v[j], p);
+    }
+    for (std::uint32_t i = 0; i < N; i += N / 2) {  // N / 2 == 3: same two triples as above
+      swap_if_needed(k[0 + i], v[0 + i], k[1 + i], v[1 + i], p);
+      swap_if_needed(k[1 + i], v[1 + i], k[2 + i], v[2 + i], p);
+      swap_if_needed(k[0 + i], v[0 + i], k[1 + i], v[1 + i], p);
+    }
+  }
+};
+
+template <class K, class V, unsigned warp_size>
+struct warp_merge_core<K, V, 3, warp_size> {
+  _RAFT_DEVICE inline void operator()(K k[3], V v[3], const std::uint32_t range, const bool asc)
+  {  // NOTE(review): `asc` is accepted but never read in this body
+    constexpr unsigned N = 3;
+    const auto lane_id   = threadIdx.x % warp_size;
+
+    if (range == 1) {  // range 1: in-thread exchanges only
+      const auto p = ((lane_id & 1) == 0);  // true on even lanes
+      swap_if_needed(k[0], v[0], k[1], v[1], p);  // pairs (0,1), (1,2), (0,1) in sequence
+      swap_if_needed(k[1], v[1], k[2], v[2], p);
+      swap_if_needed(k[0], v[0], k[1], v[1], p);
+      return;
+    }
+
+    const std::uint32_t b = range;
+    for (std::uint32_t c = b / 2; c >= 1; c >>= 1) {  // cross-lane steps, lane distance c
+      const auto p = static_cast<bool>(lane_id & b) == static_cast<bool>(lane_id & c);
+#pragma unroll
+      for (std::uint32_t i = 0; i < N; i++) {
+        swap_if_needed(k[i], v[i], c, p);  // shuffle-based overload: partner is lane ^ c
+      }
+    }
+    const auto p = ((lane_id & b) == 0);  // direction flag for the in-thread steps below
+    swap_if_needed(k[0], v[0], k[1], v[1], p);
+    swap_if_needed(k[1], v[1], k[2], v[2], p);
+    swap_if_needed(k[0], v[0], k[1], v[1], p);
+  }
+};
+
+template <class K, class V, unsigned warp_size>
+struct warp_merge_core<K, V, 2, warp_size> {
+  _RAFT_DEVICE inline void operator()(K k[2], V v[2], const std::uint32_t range, const bool asc)
+  {  // NOTE(review): `asc` is accepted but never read in this body
+    constexpr unsigned N = 2;
+    const auto lane_id   = threadIdx.x % warp_size;
+
+    if (range == 1) {  // range 1: a single in-thread exchange
+      const auto p = ((lane_id & 1) == 0);  // true on even lanes
+      swap_if_needed(k[0], v[0], k[1], v[1], p);
+      return;
+    }
+
+    const std::uint32_t b = range;
+    for (std::uint32_t c = b / 2; c >= 1; c >>= 1) {  // cross-lane steps, lane distance c
+      const auto p = static_cast<bool>(lane_id & b) == static_cast<bool>(lane_id & c);
+#pragma unroll
+      for (std::uint32_t i = 0; i < N; i++) {
+        swap_if_needed(k[i], v[i], c, p);  // shuffle-based overload: partner is lane ^ c
+      }
+    }
+    const auto p = ((lane_id & b) == 0);  // direction flag for the final in-thread exchange
+    swap_if_needed(k[0], v[0], k[1], v[1], p);
+  }
+};
+
+template <class K, class V, unsigned warp_size>
+struct warp_merge_core<K, V, 1, warp_size> {
+  _RAFT_DEVICE inline void operator()(K k[1], V v[1], const std::uint32_t range, const bool asc)
+  {  // NOTE(review): `asc` is accepted but never read in this body
+    const auto lane_id    = threadIdx.x % warp_size;
+    const std::uint32_t b = range;  // with range == 1, b / 2 == 0 and the loop below is skipped
+    for (std::uint32_t c = b / 2; c >= 1; c >>= 1) {  // cross-lane steps, lane distance c
+      const auto p = static_cast<bool>(lane_id & b) == static_cast<bool>(lane_id & c);
+      swap_if_needed(k[0], v[0], c, p);  // shuffle-based overload: partner is lane ^ c
+    }
+  }
+};
+
+}  // namespace detail
+
+template <class K, class V, unsigned N, unsigned warp_size = 32>
+__device__ void warp_merge(K k[N], V v[N], unsigned range, const bool asc = true)
+{
+  detail::warp_merge_core<K, V, N, warp_size>{}(k, v, range, asc);  // functor selected by N
+}
+
+template <class K, class V, unsigned N, unsigned warp_size = 32>
+__device__ void warp_sort(K k[N], V v[N], const bool asc = true)
+{
+  for (std::uint32_t width = 1; width <= warp_size; width *= 2) {  // 1, 2, 4, ... <= warp_size
+    warp_merge<K, V, N, warp_size>(k, v, width, asc);
+  }
+}
+
+}  // namespace bitonic
+}  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
new file mode 100644
index 0000000000..4d63fb7999
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../../cagra_types.hpp"
+#include "graph_core.cuh"
+#include <chrono>
+#include <cstdio>
+#include <vector>
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/host_device_accessor.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/host_mdspan.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+
+#include <raft/neighbors/detail/refine.cuh>
+#include <raft/neighbors/ivf_pq.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>
+#include <raft/neighbors/refine.cuh>
+
+namespace raft::neighbors::experimental::cagra::detail {
+
+using INDEX_T = std::uint32_t;
+
+/**
+ * @brief Build a kNN graph on the host: IVF-PQ search for candidates, followed by refinement.
+ * @param[in] res raft resources
+ * @param[in] dataset row-major matrix [n_rows, dim]; may be host or device accessible
+ * @param[out] knn_graph host matrix [n_rows, node_degree]; row i receives the neighbors of row i
+ * @param[in] refine_rate candidates per row = node_degree * refine_rate (2.0 when unset)
+ * @param[in] build_params IVF-PQ build parameters; defaults are filled in when unset
+ * @param[in] search_params IVF-PQ search parameters; defaults are filled in when unset
+ */
+template <typename DataT, typename IdxT, typename accessor>
+void build_knn_graph(raft::device_resources const& res,
+                     mdspan<const DataT, matrix_extent<IdxT>, row_major, accessor> dataset,
+                     raft::host_matrix_view<IdxT, IdxT, row_major> knn_graph,
+                     std::optional<float> refine_rate                   = std::nullopt,
+                     std::optional<ivf_pq::index_params> build_params   = std::nullopt,
+                     std::optional<ivf_pq::search_params> search_params = std::nullopt)
+{
+  RAFT_EXPECTS(
+    dataset.extent(1) * sizeof(DataT) % 8 == 0,
+    "Dataset rows are expected to have at least 8 bytes alignment. Try padding feature dims.");
+
+  RAFT_EXPECTS(!build_params || build_params->metric == distance::DistanceType::L2Expanded,
+               "Currently only L2Expanded metric is supported");
+  RAFT_EXPECTS(knn_graph.extent(0) == dataset.extent(0),
+               "Dataset and knn_graph must have equal number of rows");
+
+  uint32_t node_degree = knn_graph.extent(1);
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope("cagra::build_graph(%zu, %zu, %u)",
+                                                            size_t(dataset.extent(0)),
+                                                            size_t(dataset.extent(1)),
+                                                            node_degree);
+
+  if (!build_params) {
+    build_params          = ivf_pq::index_params{};
+    build_params->n_lists = dataset.extent(0) < 4 * 2500 ? 4 : (uint32_t)(dataset.extent(0) / 2500);
+    build_params->pq_dim  = raft::Pow2<8>::roundUp(dataset.extent(1) / 2);
+    build_params->pq_bits = 8;
+    build_params->kmeans_trainset_fraction = dataset.extent(0) < 10000 ? 1 : 10;
+    build_params->kmeans_n_iters           = 25;
+    build_params->add_data_on_build        = true;
+  }
+
+  // Make model name (bounded write; size_t fields use %zu, the metric enum is cast explicitly)
+  const std::string model_name = [&]() {
+    char buf[1024];
+    std::snprintf(buf,
+                  sizeof(buf),
+                  "%s-%zux%zu.cluster_%u.pq_%u.%ubit.itr_%u.metric_%u.pqcenter_%u",
+                  "IVF-PQ",
+                  static_cast<size_t>(dataset.extent(0)),
+                  static_cast<size_t>(dataset.extent(1)),
+                  build_params->n_lists,
+                  build_params->pq_dim,
+                  build_params->pq_bits,
+                  build_params->kmeans_n_iters,
+                  static_cast<uint32_t>(build_params->metric),
+                  static_cast<uint32_t>(build_params->codebook_kind));
+    return std::string(buf);
+  }();
+
+  RAFT_LOG_DEBUG("# Building IVF-PQ index %s", model_name.c_str());
+  auto index = ivf_pq::build<DataT, int64_t>(
+    res, *build_params, dataset.data_handle(), dataset.extent(0), dataset.extent(1));
+
+  // search top (k + 1) neighbors; std::min<IdxT> keeps the mixed IdxT / uint32_t calls well-formed
+  if (!search_params) {
+    search_params            = ivf_pq::search_params{};
+    search_params->n_probes  = std::min<IdxT>(dataset.extent(1) * 2, build_params->n_lists);
+    search_params->lut_dtype = CUDA_R_8U;
+    search_params->internal_distance_dtype = CUDA_R_32F;
+  }
+  const auto top_k          = node_degree + 1;  // one extra slot: a row may return itself
+  uint32_t gpu_top_k        = node_degree * refine_rate.value_or(2.0f);
+  gpu_top_k                 = std::min<IdxT>(std::max(gpu_top_k, top_k), dataset.extent(0));
+  const auto num_queries    = dataset.extent(0);
+  const auto max_batch_size = 1024;
+  RAFT_LOG_DEBUG(
+    "IVF-PQ search node_degree: %u, top_k: %u,  gpu_top_k: %u,  max_batch_size:: %d, n_probes: %u",
+    node_degree,
+    top_k,
+    gpu_top_k,
+    max_batch_size,
+    search_params->n_probes);
+
+  // TODO(tfeher): shall we use uint32_t?
+  auto distances = raft::make_device_matrix<float, int64_t>(res, max_batch_size, gpu_top_k);
+  auto neighbors = raft::make_device_matrix<int64_t, int64_t>(res, max_batch_size, gpu_top_k);
+  auto refined_distances = raft::make_device_matrix<float, int64_t>(res, max_batch_size, top_k);
+  auto refined_neighbors = raft::make_device_matrix<int64_t, int64_t>(res, max_batch_size, top_k);
+  auto neighbors_host    = raft::make_host_matrix<int64_t, int64_t>(max_batch_size, gpu_top_k);
+  auto queries_host = raft::make_host_matrix<DataT, int64_t>(max_batch_size, dataset.extent(1));
+  auto refined_neighbors_host = raft::make_host_matrix<int64_t, int64_t>(max_batch_size, top_k);
+  auto refined_distances_host = raft::make_host_matrix<float, int64_t>(max_batch_size, top_k);
+
+  // TODO(tfeher): batched search with multiple GPUs
+  std::size_t num_self_included = 0;
+  bool first                    = true;
+  const auto start_clock        = std::chrono::system_clock::now();
+
+  rmm::mr::device_memory_resource* device_memory = nullptr;
+  auto pool_guard = raft::get_pool_memory_resource(device_memory, 1024 * 1024);
+  if (pool_guard) {
+    RAFT_LOG_DEBUG("ivf_pq using pool memory resource with initial size %zu bytes",
+                   pool_guard->pool_size());
+  }
+
+  raft::spatial::knn::detail::utils::batch_load_iterator<DataT> vec_batches(dataset.data_handle(),
+                                                                            dataset.extent(0),
+                                                                            dataset.extent(1),
+                                                                            max_batch_size,
+                                                                            res.get_stream(),
+                                                                            device_memory);
+
+  for (const auto& batch : vec_batches) {
+    auto queries_view = raft::make_device_matrix_view<const DataT, int64_t>(
+      batch.data(), batch.size(), batch.row_width());
+    auto neighbors_view = make_device_matrix_view<int64_t, int64_t>(
+      neighbors.data_handle(), batch.size(), neighbors.extent(1));
+    auto distances_view = make_device_matrix_view<float, int64_t>(
+      distances.data_handle(), batch.size(), distances.extent(1));
+
+    ivf_pq::search(res, *search_params, index, queries_view, neighbors_view, distances_view);
+
+    if constexpr (is_host_mdspan_v<decltype(dataset)>) {
+      raft::copy(neighbors_host.data_handle(), neighbors.data_handle(), neighbors_view.size(),
+                 res.get_stream());
+      raft::copy(queries_host.data_handle(), batch.data(), queries_view.size(), res.get_stream());
+      auto queries_host_view = make_host_matrix_view<const DataT, int64_t>(
+        queries_host.data_handle(), batch.size(), batch.row_width());
+      auto neighbors_host_view = make_host_matrix_view<const int64_t, int64_t>(
+        neighbors_host.data_handle(), batch.size(), neighbors.extent(1));
+      auto refined_neighbors_host_view = make_host_matrix_view<int64_t, int64_t>(
+        refined_neighbors_host.data_handle(), batch.size(), top_k);
+      auto refined_distances_host_view = make_host_matrix_view<float, int64_t>(
+        refined_distances_host.data_handle(), batch.size(), top_k);
+      res.sync_stream();
+
+      raft::neighbors::detail::refine_host<int64_t, DataT, float, int64_t>(
+        dataset, queries_host_view, neighbors_host_view,
+        refined_neighbors_host_view, refined_distances_host_view,
+        build_params->metric);
+    } else {
+      auto neighbor_candidates_view = make_device_matrix_view<const int64_t, int64_t>(
+        neighbors.data_handle(), batch.size(), gpu_top_k);
+      auto refined_neighbors_view = make_device_matrix_view<int64_t, int64_t>(
+        refined_neighbors.data_handle(), batch.size(), top_k);
+      auto refined_distances_view = make_device_matrix_view<float, int64_t>(
+        refined_distances.data_handle(), batch.size(), top_k);
+
+      auto dataset_view = make_device_matrix_view<const DataT, int64_t>(
+        dataset.data_handle(), dataset.extent(0), dataset.extent(1));
+      raft::neighbors::detail::refine_device<int64_t, DataT, float, int64_t>(
+        res,
+        dataset_view, queries_view, neighbor_candidates_view,
+        refined_neighbors_view, refined_distances_view,
+        build_params->metric);
+      raft::copy(refined_neighbors_host.data_handle(), refined_neighbors_view.data_handle(),
+                 refined_neighbors_view.size(), res.get_stream());
+      res.sync_stream();
+    }
+    // omit itself & write out
+    // TODO(tfeher): do this in parallel with GPU processing of next batch
+    for (std::size_t i = 0; i < batch.size(); i++) {
+      size_t vec_idx = i + batch.offset();
+      for (std::size_t j = 0, num_added = 0; j < top_k && num_added < node_degree; j++) {
+        const auto v = refined_neighbors_host(i, j);
+        if (static_cast<size_t>(v) == vec_idx) {
+          num_self_included++;
+          continue;
+        }
+        knn_graph(vec_idx, num_added) = v;
+        num_added++;
+      }
+    }
+
+    size_t num_queries_done = batch.offset() + batch.size();
+    const auto end_clock    = std::chrono::system_clock::now();
+    const auto time =
+      std::chrono::duration_cast<std::chrono::microseconds>(end_clock - start_clock).count() * 1e-6;
+    const auto throughput = num_queries_done / time;
+    RAFT_LOG_DEBUG(
+      "# Search %12zu / %12zu (%3.2f %%), %e queries/sec, %.2f minutes ETA, self included = "
+      "%3.2f %%    \r",
+      num_queries_done,
+      static_cast<size_t>(dataset.extent(0)),
+      num_queries_done / static_cast<double>(dataset.extent(0)) * 100,
+      throughput,
+      (num_queries - num_queries_done) / throughput / 60,
+      static_cast<double>(num_self_included) / num_queries_done * 100.);
+    first = false;
+  }
+  if (!first) RAFT_LOG_DEBUG("# Finished building kNN graph");
+}
+
+}  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
new file mode 100644
index 0000000000..79cbb6198f
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/neighbors/cagra_types.hpp>
+#include <rmm/cuda_stream_view.hpp>
+
+#include "factory.cuh"
+#include "search_multi_cta.cuh"
+#include "search_multi_kernel.cuh"
+#include "search_plan.cuh"
+#include "search_single_cta.cuh"
+
+namespace raft::neighbors::experimental::cagra::detail {
+
+/**
+ * @brief Search ANN using the constructed index.
+ *
+ * See the [build](#build) documentation for a usage example.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ * @tparam DistanceT type of the distances
+ * @param[in] res raft resources used to run the search
+ * @param[in] params configure the search
+ * @param[in] index CAGRA index (dataset and graph) to search
+ * @param[in] queries a device matrix view to a row-major matrix [n_queries, index.dim()]
+ * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset
+ * [n_queries, k]
+ * @param[out] distances a device matrix view to the distances to the selected neighbors [n_queries,
+ * k]
+ */
+
+template <typename T, typename IdxT = uint32_t, typename DistanceT = float>
+void search_main(raft::device_resources const& res,
+                 search_params params,
+                 const index<T, IdxT>& index,
+                 raft::device_matrix_view<const T, IdxT, row_major> queries,
+                 raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,
+                 raft::device_matrix_view<DistanceT, IdxT, row_major> distances)
+{
+  RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n",
+                 static_cast<size_t>(index.dataset().extent(0)),
+                 static_cast<size_t>(index.dataset().extent(1)));
+  RAFT_LOG_DEBUG("# query size = %lu, dim = %lu\n",
+                 static_cast<size_t>(queries.extent(0)),
+                 static_cast<size_t>(queries.extent(1)));
+  RAFT_EXPECTS(queries.extent(1) == index.dim(), "Queries and index dim must match");
+  uint32_t topk = neighbors.extent(1);
+
+  std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>> plan =
+    factory<T, IdxT, DistanceT>::create(res, params, index.dim(), index.graph_degree(), topk);
+  RAFT_EXPECTS(plan != nullptr, "Failed to create a CAGRA search plan");
+  plan->check(neighbors.extent(1));
+
+  RAFT_LOG_DEBUG("Cagra search");
+  uint32_t max_queries = plan->max_queries;
+  uint32_t query_dim   = queries.extent(1);
+  // qid is 64-bit so that the element offsets (topk * qid etc.) cannot wrap around in 32 bits.
+  for (std::size_t qid = 0; qid < queries.extent(0); qid += max_queries) {
+    const uint32_t n_queries       = std::min<std::size_t>(max_queries, queries.extent(0) - qid);
+    IdxT* _topk_indices_ptr        = neighbors.data_handle() + (topk * qid);
+    DistanceT* _topk_distances_ptr = distances.data_handle() + (topk * qid);
+    // todo(tfeher): one could keep distances optional and pass nullptr
+    const T* _query_ptr = queries.data_handle() + (query_dim * qid);
+    const IdxT* _seed_ptr =
+      plan->num_seeds > 0 ? plan->dev_seed.data() + (plan->num_seeds * qid) : nullptr;
+    uint32_t* _num_executed_iterations = nullptr;
+
+    (*plan)(res,
+            index.dataset(),
+            index.graph(),
+            _topk_indices_ptr,
+            _topk_distances_ptr,
+            _query_ptr,
+            n_queries,
+            _seed_ptr,
+            _num_executed_iterations,
+            topk);
+  }
+}
+/** @} */  // end group cagra
+
+}  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
new file mode 100644
index 0000000000..171f261cf3
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_serialize.cuh
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/mdarray.hpp>
+#include <raft/core/serialize.hpp>
+#include <raft/neighbors/cagra_types.hpp>
+
+#include <fstream>
+
+namespace raft::neighbors::experimental::cagra::detail {
+
+// Serialization version 1.
+constexpr int serialization_version = 1;
+
+// NB: the check is wrapped in a struct template so that the actual size (RealSize) is easy to see
+// in the compiler error message.
+template <size_t RealSize, size_t ExpectedSize>
+struct check_index_layout {
+  static_assert(RealSize == ExpectedSize,
+                "The size of the index struct has changed since the last update; "
+                "paste in the new size and consider updating the serialization logic");
+};
+
+template struct check_index_layout<sizeof(index<double, std::uint64_t>), 136>;
+
+/**
+ * Write the index to an output stream.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @param[in] res the raft resource handle
+ * @param[in] os output stream the serialized index is written to
+ * @param[in] index_ CAGRA index
+ *
+ */
+template <typename T, typename IdxT>
+void serialize(raft::device_resources const& res, std::ostream& os, const index<T, IdxT>& index_)
+{
+  RAFT_LOG_DEBUG(
+    "Saving CAGRA index, size %zu, dim %u", static_cast<size_t>(index_.size()), index_.dim());
+  // Scalars first, then dataset and graph; deserialize() reads the fields in this same order.
+  serialize_scalar(res, os, serialization_version);
+  serialize_scalar(res, os, index_.size());
+  serialize_scalar(res, os, index_.dim());
+  serialize_scalar(res, os, index_.graph_degree());
+  serialize_scalar(res, os, index_.metric());
+  serialize_mdspan(res, os, index_.dataset());
+  serialize_mdspan(res, os, index_.graph());
+}
+
+template <typename T, typename IdxT>
+void serialize(raft::device_resources const& res,
+               const std::string& filename,
+               const index<T, IdxT>& index_)
+{
+  std::ofstream of(filename, std::ios::out | std::ios::binary);
+  if (!of) { RAFT_FAIL("Cannot open file %s", filename.c_str()); }
+  // The stream overload writes the actual index contents.
+  detail::serialize(res, of, index_);
+  // Re-check the stream state after closing so that a failed write is reported.
+  of.close();
+  if (!of) { RAFT_FAIL("Error writing output %s", filename.c_str()); }
+}
+
+/** Load an index from an input stream.
+ *
+ * Experimental, both the API and the serialization format are subject to change.
+ *
+ * @param[in] res the raft resource handle
+ * @param[in] is input stream positioned at the start of a serialized index
+ * @return the CAGRA index constructed from the stream contents
+ *
+ */
+template <typename T, typename IdxT>
+auto deserialize(raft::device_resources const& res, std::istream& is) -> index<T, IdxT>
+{
+  auto ver = deserialize_scalar<int>(res, is);
+  if (ver != serialization_version) {
+    RAFT_FAIL("serialization version mismatch, expected %d, got %d ", serialization_version, ver);
+  }
+  auto n_rows       = deserialize_scalar<IdxT>(res, is);
+  auto dim          = deserialize_scalar<std::uint32_t>(res, is);
+  auto graph_degree = deserialize_scalar<std::uint32_t>(res, is);
+  auto metric       = deserialize_scalar<raft::distance::DistanceType>(res, is);
+  // Dataset and graph are read into host matrices and handed to the index constructor as views.
+  auto dataset = raft::make_host_matrix<T, IdxT>(n_rows, dim);
+  auto graph   = raft::make_host_matrix<IdxT, IdxT>(n_rows, graph_degree);
+
+  deserialize_mdspan(res, is, dataset.view());
+  deserialize_mdspan(res, is, graph.view());
+
+  return index<T, IdxT>(res, metric, raft::make_const_mdspan(dataset.view()), graph.view());
+}
+
+template <typename T, typename IdxT>
+auto deserialize(raft::device_resources const& res, const std::string& filename) -> index<T, IdxT>
+{
+  std::ifstream is(filename, std::ios::in | std::ios::binary);
+  // The file is opened in binary mode; fail right away if that did not work.
+  if (!is) { RAFT_FAIL("Cannot open file %s", filename.c_str()); }
+  // The stream overload does the actual parsing.
+  auto index = detail::deserialize<T, IdxT>(res, is);
+
+  is.close();
+
+  return index;
+}
+}  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp
new file mode 100644
index 0000000000..a05c714700
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "device_common.hpp"
+#include "hashmap.hpp"
+#include "utils.hpp"
+#include <type_traits>
+
+namespace raft::neighbors::experimental::cagra::detail {
+namespace device {
+
+// using LOAD_256BIT_T = ulonglong4;
+using LOAD_128BIT_T = uint4;
+using LOAD_64BIT_T  = uint64_t;
+
+template <class LOAD_T, class DATA_T>
+_RAFT_DEVICE constexpr unsigned get_vlen()
+{
+  return utils::size_of<LOAD_T>() / utils::size_of<DATA_T>();  // DATA_T elements per LOAD_T
+}
+
+template <class LOAD_T, class DATA_T, unsigned VLEN>
+struct data_load_t {
+  union {
+    LOAD_T load;        // filled by a single wide load
+    DATA_T data[VLEN];  // the same storage viewed as VLEN individual elements
+  };
+};
+
+template <unsigned TEAM_SIZE,
+          unsigned MAX_DATASET_DIM,
+          class LOAD_T,
+          class DATA_T,
+          class DISTANCE_T,
+          class INDEX_T>
+_RAFT_DEVICE void compute_distance_to_random_nodes(
+  INDEX_T* const result_indices_ptr,       // [num_pickup]
+  DISTANCE_T* const result_distances_ptr,  // [num_pickup]
+  const float* const query_buffer,         // read through device::swizzling()
+  const DATA_T* const dataset_ptr,         // [dataset_size, dataset_dim]
+  const std::size_t dataset_dim,
+  const std::size_t dataset_size,
+  const std::size_t num_pickup,
+  const unsigned num_distilation,  // random candidates tried per output slot
+  const uint64_t rand_xor_mask,
+  const INDEX_T* seed_ptr,  // [num_seeds]
+  const uint32_t num_seeds,
+  uint32_t* const visited_hash_ptr,
+  const uint32_t hash_bitlen,
+  const uint32_t block_id   = 0,
+  const uint32_t num_blocks = 1)
+{
+  const unsigned lane_id   = threadIdx.x % TEAM_SIZE;
+  constexpr unsigned vlen  = get_vlen<LOAD_T, DATA_T>();
+  constexpr unsigned nelem = (MAX_DATASET_DIM + (TEAM_SIZE * vlen) - 1) / (TEAM_SIZE * vlen);
+  struct data_load_t<LOAD_T, DATA_T, vlen> dl_buff[nelem];
+  uint32_t max_i = num_pickup;
+  if (max_i % (32 / TEAM_SIZE)) { max_i += (32 / TEAM_SIZE) - (max_i % (32 / TEAM_SIZE)); }
+  for (uint32_t i = threadIdx.x / TEAM_SIZE; i < max_i; i += blockDim.x / TEAM_SIZE) {
+    const bool valid_i = (i < num_pickup);
+    // Start from the invalid index so that "no candidate selected" is detectable after the loop.
+    INDEX_T best_index_team_local    = utils::get_max_value<INDEX_T>();
+    DISTANCE_T best_norm2_team_local = utils::get_max_value<DISTANCE_T>();
+    for (uint32_t j = 0; j < num_distilation; j++) {
+      // Select a node randomly and compute the distance to it
+      uint32_t seed_index;
+      DISTANCE_T norm2 = 0.0;
+      if (valid_i) {
+        uint32_t gid = block_id + (num_blocks * (i + (num_pickup * j)));
+        if (seed_ptr && (gid < num_seeds)) {
+          seed_index = seed_ptr[gid];
+        } else {
+          seed_index = device::xorshift64(gid ^ rand_xor_mask) % dataset_size;
+        }
+#pragma unroll
+        for (uint32_t e = 0; e < nelem; e++) {
+          const uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen;
+          if (k >= dataset_dim) break;
+          dl_buff[e].load = ((LOAD_T*)(dataset_ptr + k + (dataset_dim * seed_index)))[0];
+        }
+#pragma unroll
+        for (uint32_t e = 0; e < nelem; e++) {
+          const uint32_t k = (lane_id + (TEAM_SIZE * e)) * vlen;
+          if (k >= dataset_dim) break;
+#pragma unroll
+          for (uint32_t v = 0; v < vlen; v++) {
+            const uint32_t kv = k + v;
+            // if (kv >= dataset_dim) break;
+            DISTANCE_T diff = query_buffer[device::swizzling(kv)];
+            diff -= static_cast<float>(dl_buff[e].data[v]) * device::fragment_scale<DATA_T>();
+            norm2 += diff * diff;
+          }
+        }
+      }
+      for (uint32_t offset = TEAM_SIZE / 2; offset > 0; offset >>= 1) {
+        norm2 += __shfl_xor_sync(0xffffffff, norm2, offset);
+      }
+      // Keep the closest candidate seen so far for this output slot.
+      if (valid_i && (norm2 < best_norm2_team_local)) {
+        best_norm2_team_local = norm2;
+        best_index_team_local = seed_index;
+      }
+    }
+    // Lane 0 of each team records the winner unless none was selected or insert() rejects it.
+    if (valid_i && (threadIdx.x % TEAM_SIZE == 0)) {
+      if ((best_index_team_local != utils::get_max_value<INDEX_T>()) &&
+          hashmap::insert(visited_hash_ptr, hash_bitlen, best_index_team_local)) {
+        result_distances_ptr[i] = best_norm2_team_local;
+        result_indices_ptr[i]   = best_index_team_local;
+      } else {
+        result_distances_ptr[i] = utils::get_max_value<DISTANCE_T>();
+        result_indices_ptr[i]   = utils::get_max_value<INDEX_T>();
+      }
+    }
+  }
+}
+
+template <unsigned TEAM_SIZE,
+          unsigned BLOCK_SIZE,
+          unsigned MAX_DATASET_DIM,
+          unsigned MAX_N_FRAGS,
+          class LOAD_T,
+          class DATA_T,
+          class DISTANCE_T,
+          class INDEX_T>
+_RAFT_DEVICE void compute_distance_to_child_nodes(INDEX_T* const result_child_indices_ptr,
+                                                  DISTANCE_T* const result_child_distances_ptr,
+                                                  // query (read through device::swizzling())
+                                                  const float* const query_buffer,
+                                                  // [dataset_size, dataset_dim], one row per vector
+                                                  const DATA_T* const dataset_ptr,
+                                                  const std::size_t dataset_dim,
+                                                  // row p holds the knn_k child ids of node p
+                                                  const INDEX_T* const knn_graph,
+                                                  const std::uint32_t knn_k,
+                                                  // hashmap
+                                                  std::uint32_t* const visited_hashmap_ptr,
+                                                  const std::uint32_t hash_bitlen,
+                                                  const INDEX_T* const parent_indices,
+                                                  const std::uint32_t num_parents)
+{
+  const INDEX_T invalid_index = utils::get_max_value<INDEX_T>();
+
+  // Read child indices of parents from knn graph and check if the distance
+  // computation is necessary.
+  for (uint32_t i = threadIdx.x; i < knn_k * num_parents; i += BLOCK_SIZE) {
+    const INDEX_T parent_id = parent_indices[i / knn_k];
+    INDEX_T child_id        = invalid_index;
+    if (parent_id != invalid_index) {
+      child_id = knn_graph[(i % knn_k) + ((uint64_t)knn_k * parent_id)];
+    }
+    if (child_id != invalid_index) {
+      if (hashmap::insert(visited_hashmap_ptr, hash_bitlen, child_id) == 0) {
+        child_id = invalid_index;  // insert() returned 0: presumably already visited
+      }
+    }
+    result_child_indices_ptr[i] = child_id;
+  }
+
+  constexpr unsigned vlen  = get_vlen<LOAD_T, DATA_T>();
+  constexpr unsigned nelem = (MAX_DATASET_DIM + (TEAM_SIZE * vlen) - 1) / (TEAM_SIZE * vlen);
+  const unsigned lane_id   = threadIdx.x % TEAM_SIZE;
+
+  // [Notice]
+  //   Loading the query vector here from shared memory into registers reduces
+  //   shared memory traffic. However, register usage increases. The
+  //   MAX_N_FRAGS below is used as the threshold to enable or disable this,
+  //   but the appropriate value should be discussed.
+  constexpr unsigned N_FRAGS = (MAX_DATASET_DIM + TEAM_SIZE - 1) / TEAM_SIZE;
+  float query_frags[N_FRAGS];
+  if (N_FRAGS <= MAX_N_FRAGS) {
+    // Pre-load query vectors into registers when register usage is not too large.
+#pragma unroll
+    for (unsigned e = 0; e < nelem; e++) {
+      const unsigned k = (lane_id + (TEAM_SIZE * e)) * vlen;
+      // if (k >= dataset_dim) break;
+#pragma unroll
+      for (unsigned v = 0; v < vlen; v++) {
+        const unsigned kv = k + v;
+        const unsigned ev = (vlen * e) + v;
+        query_frags[ev]   = query_buffer[device::swizzling(kv)];
+      }
+    }
+  }
+  __syncthreads();  // the child indices written above are read back by other threads below
+
+  // Compute the distance to child nodes (i is padded so all lanes reach the full-mask shuffle)
+  std::uint32_t max_i = knn_k * num_parents;
+  if (max_i % (32 / TEAM_SIZE)) { max_i += (32 / TEAM_SIZE) - (max_i % (32 / TEAM_SIZE)); }
+  for (std::uint32_t i = threadIdx.x / TEAM_SIZE; i < max_i; i += BLOCK_SIZE / TEAM_SIZE) {
+    const bool valid_i = (i < (knn_k * num_parents));
+    INDEX_T child_id   = invalid_index;
+    if (valid_i) { child_id = result_child_indices_ptr[i]; }
+
+    DISTANCE_T norm2 = 0.0;
+    struct data_load_t<LOAD_T, DATA_T, vlen> dl_buff[nelem];
+    if (child_id != invalid_index) {
+#pragma unroll
+      for (unsigned e = 0; e < nelem; e++) {
+        const unsigned k = (lane_id + (TEAM_SIZE * e)) * vlen;
+        if (k >= dataset_dim) break;
+        dl_buff[e].load = ((LOAD_T*)(dataset_ptr + k + (dataset_dim * child_id)))[0];
+      }
+#pragma unroll
+      for (unsigned e = 0; e < nelem; e++) {
+        const unsigned k = (lane_id + (TEAM_SIZE * e)) * vlen;
+        if (k >= dataset_dim) break;
+#pragma unroll
+        for (unsigned v = 0; v < vlen; v++) {
+          DISTANCE_T diff;
+          if (N_FRAGS <= MAX_N_FRAGS) {
+            const unsigned ev = (vlen * e) + v;
+            diff              = query_frags[ev];
+          } else {
+            const unsigned kv = k + v;
+            diff              = query_buffer[device::swizzling(kv)];
+          }
+          diff -= static_cast<float>(dl_buff[e].data[v]) * device::fragment_scale<DATA_T>();
+          norm2 += diff * diff;
+        }
+      }
+    }
+    for (unsigned offset = TEAM_SIZE / 2; offset > 0; offset >>= 1) {
+      norm2 += __shfl_xor_sync(0xffffffff, norm2, offset);
+    }
+
+    // Store the distance: lane 0 of each team writes, invalid children get the max value
+    if (valid_i && (threadIdx.x % TEAM_SIZE == 0)) {
+      if (child_id != invalid_index) {
+        result_child_distances_ptr[i] = norm2;
+      } else {
+        result_child_distances_ptr[i] = utils::get_max_value<DISTANCE_T>();
+      }
+    }
+  }
+}
+
+}  // namespace device
+}  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/device_common.hpp b/cpp/include/raft/neighbors/detail/cagra/device_common.hpp
new file mode 100644
index 0000000000..20f30d9f11
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/device_common.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "utils.hpp"
+#include <cfloat>
+#include <cstdint>
+#include <cuda_fp16.h>
+#include <raft/core/detail/macros.hpp>
+
+namespace raft::neighbors::experimental::cagra::detail {
+namespace device {
+
+// warpSize for compile time calculation
+constexpr unsigned warp_size = 32;
+
+// Scaling factor applied to dataset elements of type T during distance computation.
+template <class T>
+_RAFT_HOST_DEVICE constexpr float fragment_scale();
+template <>
+_RAFT_HOST_DEVICE constexpr float fragment_scale<float>()
+{
+  return 1.0f;
+}
+template <>
+_RAFT_HOST_DEVICE constexpr float fragment_scale<half>()
+{
+  return 1.0f;
+}
+template <>
+_RAFT_HOST_DEVICE constexpr float fragment_scale<uint8_t>()
+{
+  return 1.0f / 256.0f;  // 8-bit unsigned values are scaled by 2^-8
+}
+template <>
+_RAFT_HOST_DEVICE constexpr float fragment_scale<int8_t>()
+{
+  return 1.0f / 128.0f;  // 8-bit signed values are scaled by 2^-7
+}
+
+/** Xorshift random number generator (xorshift64* variant: three xor-shifts, then a multiply).
+ * Pure function of `u`: no state is kept between calls.
+ * See https://en.wikipedia.org/wiki/Xorshift#xorshift for reference.
+ */
+_RAFT_HOST_DEVICE inline uint64_t xorshift64(uint64_t u)
+{
+  u ^= u >> 12;
+  u ^= u << 25;
+  u ^= u >> 27;
+  return u * 0x2545F4914F6CDD1DULL;
+}
+
+template <class T>
+_RAFT_DEVICE inline T swizzling(T x)
+{
+  // Address swizzling reduces bank conflicts in shared memory, at the cost of a few extra
+  // instructions per access.
+  // return x;
+  return x ^ (x >> 5);  // "x" must be less than 1024
+}
+
+}  // namespace device
+}  // namespace raft::neighbors::experimental::cagra::detail
\ No newline at end of file
diff --git a/cpp/include/raft/neighbors/detail/cagra/factory.cuh b/cpp/include/raft/neighbors/detail/cagra/factory.cuh
new file mode 100644
index 0000000000..beeebc605c
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/factory.cuh
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "search_multi_cta.cuh"
+#include "search_multi_kernel.cuh"
+#include "search_plan.cuh"
+#include "search_single_cta.cuh"
+
+namespace raft::neighbors::experimental::cagra::detail {
+
+template <typename T, typename IdxT = uint32_t, typename DistanceT = float>
+class factory {
+ public:
+  /**
+   * Create a search plan for dim features; returns an empty pointer if max_dim is unsupported.
+   */
+  static std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>> create(
+    raft::device_resources const& res,
+    search_params const& params,
+    int64_t dim,
+    int64_t graph_degree,
+    uint32_t topk)
+  {
+    search_plan_impl_base plan(params, dim, graph_degree, topk);  // supplies max_dim/team_size/algo
+    switch (plan.max_dim) {  // each supported max_dim is paired with exactly one team size
+      case 128:
+        switch (plan.team_size) {
+          case 8: return dispatch_kernel<128, 8>(res, plan); break;
+          default: THROW("Incorrect team size %lu", plan.team_size);
+        }
+        break;
+      case 256:
+        switch (plan.team_size) {
+          case 16: return dispatch_kernel<256, 16>(res, plan); break;
+          default: THROW("Incorrect team size %lu", plan.team_size);
+        }
+        break;
+      case 512:
+        switch (plan.team_size) {
+          case 32: return dispatch_kernel<512, 32>(res, plan); break;
+          default: THROW("Incorrect team size %lu", plan.team_size);
+        }
+        break;
+      case 1024:
+        switch (plan.team_size) {
+          case 32: return dispatch_kernel<1024, 32>(res, plan); break;
+          default: THROW("Incorrect team size %lu", plan.team_size);
+        }
+        break;
+      default: RAFT_LOG_DEBUG("Incorrect max_dim (%lu)\n", plan.max_dim);
+    }
+    return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>>();  // unsupported max_dim
+  }
+
+ private:
+  template <unsigned MAX_DATASET_DIM, unsigned TEAM_SIZE>
+  static std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>> dispatch_kernel(
+    raft::device_resources const& res, search_plan_impl_base& plan)
+  {
+    if (plan.algo == search_algo::SINGLE_CTA) {
+      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>>(
+        new single_cta_search::search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT>(
+          res, plan, plan.dim, plan.graph_degree, plan.topk));
+    } else if (plan.algo == search_algo::MULTI_CTA) {
+      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>>(
+        new multi_cta_search::search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT>(
+          res, plan, plan.dim, plan.graph_degree, plan.topk));
+    } else {  // every other algo value selects the multi-kernel implementation
+      return std::unique_ptr<search_plan_impl<T, IdxT, DistanceT>>(
+        new multi_kernel_search::search<TEAM_SIZE, MAX_DATASET_DIM, T, IdxT, DistanceT>(
+          res, plan, plan.dim, plan.graph_degree, plan.topk));
+    }
+  }
+};
+};  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/fragment.hpp b/cpp/include/raft/neighbors/detail/cagra/fragment.hpp
new file mode 100644
index 0000000000..d5ec2207e7
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/fragment.hpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "device_common.hpp"
+#include "utils.hpp"
+#include <raft/core/logger.hpp>
+#include <type_traits>
+
+namespace raft::neighbors::experimental::cagra::detail {
+namespace device {
+
+namespace detail {
+template <unsigned SIZE>  // SIZE: see fragment::block_t for how it is computed
+struct load_unit_t {
+  using type = uint4;  // used for every SIZE that has no specialization below
+};
+template <>
+struct load_unit_t<8> {
+  using type = std::uint64_t;
+};
+template <>
+struct load_unit_t<4> {
+  using type = std::uint32_t;
+};
+template <>
+struct load_unit_t<2> {
+  using type = std::uint16_t;
+};
+template <>
+struct load_unit_t<1> {
+  using type = std::uint8_t;
+};
+}  // namespace detail
+
+// One dataset or query vector is distributed over a team of threads and stored as `fragment`.
+template <int DIM, class T, unsigned TEAM_SIZE, class ENABLED>
+struct fragment_base {
+};  // empty base; its last argument carries the enable_if constraint used by fragment
+template <int DIM, class T, unsigned TEAM_SIZE = warp_size>
+struct fragment
+  : fragment_base<DIM,
+                  T,
+                  TEAM_SIZE,
+                  typename std::enable_if<DIM % (TEAM_SIZE * utils::size_of<T>()) == 0>::type> {
+  static constexpr unsigned num_elements = DIM / TEAM_SIZE;  // elements held by each thread
+  using block_t = typename detail::load_unit_t<num_elements * utils::size_of<T>()>::type;
+  static constexpr unsigned num_load_blocks =
+    num_elements * utils::size_of<T>() / utils::size_of<block_t>();
+  // Per-thread storage, viewed element-wise (x) or as wide load blocks (load_block).
+  union {
+    T x[num_elements];
+    block_t load_block[num_load_blocks];
+  };
+};
+
+// Load a vector from device/shared memory into a fragment (each lane keeps its own part)
+template <int DIM, class T, unsigned TEAM_SIZE, class INPUT_T>
+_RAFT_DEVICE void load_vector_sync(device::fragment<DIM, T, TEAM_SIZE>& frag,
+                                   const INPUT_T* const input_vector_ptr,
+                                   const unsigned input_vector_length,
+                                   const bool sync = true)
+{
+  const auto lane_id = threadIdx.x % TEAM_SIZE;
+  if (DIM == input_vector_length) {  // full-length vector: copy whole load blocks
+    for (unsigned i = 0; i < frag.num_load_blocks; i++) {
+      const auto vector_index = i * TEAM_SIZE + lane_id;  // index in units of block_t
+      frag.load_block[i] =
+        reinterpret_cast<const typename device::fragment<DIM, T, TEAM_SIZE>::block_t*>(
+          input_vector_ptr)[vector_index];
+    }
+  } else {
+    for (unsigned i = 0; i < frag.num_elements; i++) {
+      const auto vector_index = i * TEAM_SIZE + lane_id;
+      // Elements past the end of the input are zero-filled.
+      INPUT_T v;
+      if (vector_index < input_vector_length) {
+        v = static_cast<INPUT_T>(input_vector_ptr[vector_index]);
+      } else {
+        v = static_cast<INPUT_T>(0);
+      }
+
+      frag.x[i] = v;
+    }
+  }
+  if (sync) { __syncwarp(); }
+}
+
+// Compute the squared L2 distance between two fragments (sum of squared element differences)
+template <class COMPUTE_T, int DIM, class T, unsigned TEAM_SIZE>
+_RAFT_DEVICE COMPUTE_T norm2(const device::fragment<DIM, T, TEAM_SIZE>& a,
+                             const device::fragment<DIM, T, TEAM_SIZE>& b)
+{
+  COMPUTE_T sum = 0;
+
+  // Compute the thread-local norm2
+  for (unsigned i = 0; i < a.num_elements; i++) {
+    const auto diff = static_cast<COMPUTE_T>(a.x[i]) - static_cast<COMPUTE_T>(b.x[i]);
+    sum += diff * diff;
+  }
+
+  // Sum the thread-local partial results across the team with xor shuffles.
+  for (unsigned offset = TEAM_SIZE / 2; offset > 0; offset >>= 1)
+    sum += __shfl_xor_sync(0xffffffff, sum, offset);
+
+  return sum;
+}
+
+template <class COMPUTE_T, int DIM, class T, unsigned TEAM_SIZE>
+_RAFT_DEVICE COMPUTE_T norm2(const device::fragment<DIM, T, TEAM_SIZE>& a,
+                             const device::fragment<DIM, T, TEAM_SIZE>& b,
+                             const float scale)
+{
+  COMPUTE_T sum = 0;
+  // Scaled variant: each difference is taken in float and multiplied by scale before squaring.
+  // Compute the thread-local norm2
+  for (unsigned i = 0; i < a.num_elements; i++) {
+    const auto diff =
+      static_cast<COMPUTE_T>((static_cast<float>(a.x[i]) - static_cast<float>(b.x[i])) * scale);
+    sum += diff * diff;
+  }
+
+  // Sum the thread-local partial results across the team with xor shuffles.
+  for (unsigned offset = TEAM_SIZE / 2; offset > 0; offset >>= 1)
+    sum += __shfl_xor_sync(0xffffffff, sum, offset);
+
+  return sum;
+}
+
+template <class COMPUTE_T, int DIM, class T, unsigned TEAM_SIZE>
+_RAFT_DEVICE COMPUTE_T norm2(const device::fragment<DIM, T, TEAM_SIZE>& a,
+                             const T* b,  // [DIM]
+                             const float scale)
+{
+  COMPUTE_T sum = 0;
+  // b is a plain [DIM] array; j below is the position in b of this thread's i-th element.
+  // Compute the thread-local norm2
+  const unsigned chunk_size = a.num_elements / a.num_load_blocks;  // elements per load block
+  const unsigned lane_id    = threadIdx.x % TEAM_SIZE;
+  for (unsigned i = 0; i < a.num_elements; i++) {
+    unsigned j      = (i % chunk_size) + chunk_size * (lane_id + TEAM_SIZE * (i / chunk_size));
+    const auto diff = static_cast<COMPUTE_T>(a.x[i] * scale) - static_cast<COMPUTE_T>(b[j] * scale);
+    sum += diff * diff;
+  }
+
+  // Sum the thread-local partial results across the team with xor shuffles.
+  for (unsigned offset = TEAM_SIZE / 2; offset > 0; offset >>= 1)
+    sum += __shfl_xor_sync(0xffffffff, sum, offset);
+
+  return sum;
+}
+
+template <class COMPUTE_T, int DIM, class T, unsigned TEAM_SIZE>
+_RAFT_DEVICE inline COMPUTE_T norm2x(const device::fragment<DIM, T, TEAM_SIZE>& a,
+                                     const COMPUTE_T* b,  // [dim]
+                                     const uint32_t dim,
+                                     const float scale)
+{
+  // Compute the thread-local norm2 (scale is applied to a only; b is used as given)
+  COMPUTE_T sum          = 0;
+  const unsigned lane_id = threadIdx.x % TEAM_SIZE;
+  if (dim == DIM) {  // fragment elements are grouped in load blocks of chunk_size elements
+    const unsigned chunk_size = a.num_elements / a.num_load_blocks;
+    for (unsigned i = 0; i < a.num_elements; i++) {
+      unsigned j      = (i % chunk_size) + chunk_size * (lane_id + TEAM_SIZE * (i / chunk_size));
+      const auto diff = static_cast<COMPUTE_T>(a.x[i] * scale) - b[j];
+      sum += diff * diff;
+    }
+  } else {  // element-wise layout: element i of a lane pairs with b[lane_id + TEAM_SIZE * i]
+    for (unsigned i = 0; i < a.num_elements; i++) {
+      unsigned j = lane_id + (TEAM_SIZE * i);
+      if (j >= dim) break;  // past the end of b
+      const auto diff = static_cast<COMPUTE_T>(a.x[i] * scale) - b[j];
+      sum += diff * diff;
+    }
+  }
+
+  // Sum the thread-local partial results across the team with xor shuffles.
+  for (unsigned offset = TEAM_SIZE / 2; offset > 0; offset >>= 1)
+    sum += __shfl_xor_sync(0xffffffff, sum, offset);
+
+  return sum;
+}
+
+template <int DIM, class T, unsigned TEAM_SIZE>
+_RAFT_DEVICE void print_fragment(const device::fragment<DIM, T, TEAM_SIZE>& a)
+{
+  for (unsigned i = 0; i < TEAM_SIZE; i++) {  // one lane at a time, in lane order
+    if ((threadIdx.x % TEAM_SIZE) == i) {
+      for (unsigned j = 0; j < a.num_elements; j++) {
+        RAFT_LOG_DEBUG("%+e ", static_cast<float>(a.x[j]));  // NOTE(review): confirm usable on device
+      }
+    }
+    __syncwarp();
+  }
+}
+
+}  // namespace device
+}  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
new file mode 100644
index 0000000000..568ad0826c
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/graph_core.cuh
@@ -0,0 +1,809 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cassert>
+#include <climits>
+#include <cuda_fp16.h>
+#include <float.h>
+#include <iostream>
+#include <memory>
+#include <omp.h>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_device_accessor.hpp>
+#include <raft/core/mdspan.hpp>
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+#include <random>
+#include <sys/time.h>
+
+#include <raft/util/cuda_rt_essentials.hpp>
+
+namespace raft::neighbors::experimental::cagra::detail {
+namespace graph {
+
+template <class T>
+__host__ __device__ float compute_norm2(const T* a,  // [dim]
+                                        const T* b,  // [dim]
+                                        const std::size_t dim,
+                                        const float scale)  // applied to both vectors
+{
+  float acc = 0.f;  // running sum of the squared differences of the scaled elements
+  for (std::size_t idx = 0; idx != dim; ++idx) {
+    const auto delta = a[idx] * scale - b[idx] * scale;
+    acc += delta * delta;
+  }
+  return acc;
+}
+
+inline double cur_time(void)
+{
+  struct timeval now;  // filled by gettimeofday: seconds plus microseconds
+  gettimeofday(&now, NULL);
+  return static_cast<double>(now.tv_sec) + static_cast<double>(now.tv_usec) * 1e-6;
+}
+
+template <typename T>
+__device__ inline void swap(T& val1, T& val2)
+{
+  const T saved = val2;  // exchange the two values through a temporary copy
+  val2          = val1;
+  val1          = saved;
+}
+
+template <typename K, typename V>
+__device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool ascending)
+{
+  // Swap both pairs when the keys differ and (key1 > key2) == ascending; true if swapped.
+  const bool out_of_order = !(key1 == key2) && ((key1 > key2) == ascending);
+  if (out_of_order) {
+    swap<K>(key1, key2);
+    swap<V>(val1, val2);
+  }
+  return out_of_order;
+}
+
+template <class DATA_T, int blockDim_x, int numElementsPerThread>
+__global__ void kern_sort(  // one block per graph node: sorts that node's neighbors by distance
+  DATA_T** dataset,  // [num_gpus][dataset_chunk_size, dataset_dim]
+  uint32_t dataset_size,
+  uint32_t dataset_chunk_size,  // (*) num_gpus * dataset_chunk_size >= dataset_size
+  uint32_t dataset_dim,
+  float scale,
+  uint32_t** knn_graph,  // [num_gpus][graph_chunk_size, graph_degree]
+  uint32_t graph_size,
+  uint32_t graph_chunk_size,  // (*) num_gpus * graph_chunk_size >= graph_size
+  uint32_t graph_degree,
+  int dev_id)
+{
+  __shared__ float smem_keys[blockDim_x * numElementsPerThread];     // distances
+  __shared__ uint32_t smem_vals[blockDim_x * numElementsPerThread];  // neighbor ids
+
+  uint64_t srcNode     = blockIdx.x + ((uint64_t)graph_chunk_size * dev_id);  // global node id
+  uint64_t srcNode_dev = srcNode / graph_chunk_size;  // index of the chunk holding the node
+  uint64_t srcNode_loc = srcNode % graph_chunk_size;  // row of the node inside that chunk
+  if (srcNode >= graph_size) { return; }
+
+  const uint32_t num_warps = blockDim_x / 32;
+  const uint32_t warp_id   = threadIdx.x / 32;
+  const uint32_t lane_id   = threadIdx.x % 32;
+
+  // Compute distance from a src node to its neighbors (one warp per neighbor, lanes split dims)
+  for (int k = warp_id; k < graph_degree; k += num_warps) {
+    uint64_t dstNode     = knn_graph[srcNode_dev][k + ((uint64_t)graph_degree * srcNode_loc)];
+    uint64_t dstNode_dev = dstNode / graph_chunk_size;
+    uint64_t dstNode_loc = dstNode % graph_chunk_size;
+    float dist           = 0.0;
+    for (int d = lane_id; d < dataset_dim; d += 32) {
+      float diff =
+        (float)(dataset[srcNode_dev][d + ((uint64_t)dataset_dim * srcNode_loc)]) * scale -
+        (float)(dataset[dstNode_dev][d + ((uint64_t)dataset_dim * dstNode_loc)]) * scale;
+      dist += diff * diff;
+    }
+    dist += __shfl_xor_sync(0xffffffff, dist, 1);  // XOR-shuffle sum of the 32 lane partials
+    dist += __shfl_xor_sync(0xffffffff, dist, 2);
+    dist += __shfl_xor_sync(0xffffffff, dist, 4);
+    dist += __shfl_xor_sync(0xffffffff, dist, 8);
+    dist += __shfl_xor_sync(0xffffffff, dist, 16);
+    if (lane_id == 0) {
+      smem_keys[k] = dist;
+      smem_vals[k] = dstNode;
+    }
+  }
+  __syncthreads();
+
+  float my_keys[numElementsPerThread];  // this thread's consecutive slice of the list
+  uint32_t my_vals[numElementsPerThread];
+  for (int i = 0; i < numElementsPerThread; i++) {
+    int k = i + (numElementsPerThread * threadIdx.x);
+    if (k < graph_degree) {
+      my_keys[i] = smem_keys[k];
+      my_vals[i] = smem_vals[k];
+    } else {
+      my_keys[i] = FLT_MAX;  // padding for slots past graph_degree
+      my_vals[i] = 0xffffffffU;
+    }
+  }
+  __syncthreads();
+
+  // Sorting by thread (odd-even compare-exchange passes; direction alternates with thread parity)
+  uint32_t mask  = 1;
+  bool ascending = ((threadIdx.x & mask) == 0);
+  for (int j = 0; j < numElementsPerThread; j += 2) {
+#pragma unroll
+    for (int i = 0; i < numElementsPerThread; i += 2) {
+      swap_if_needed<float, uint32_t>(
+        my_keys[i], my_keys[i + 1], my_vals[i], my_vals[i + 1], ascending);
+    }
+#pragma unroll
+    for (int i = 1; i < numElementsPerThread - 1; i += 2) {
+      swap_if_needed<float, uint32_t>(
+        my_keys[i], my_keys[i + 1], my_vals[i], my_vals[i + 1], ascending);
+    }
+  }
+
+  // Bitonic Sorting: compare-exchange between threads, doubling `mask` after every round
+  while (mask < blockDim_x) {
+    uint32_t next_mask = mask << 1;
+
+    for (uint32_t curr_mask = mask; curr_mask > 0; curr_mask >>= 1) {
+      bool ascending = ((threadIdx.x & curr_mask) == 0) == ((threadIdx.x & next_mask) == 0);
+      if (mask >= 32) {
+        // inter warp: read the partner thread's elements through shared memory
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < numElementsPerThread; i++) {
+          smem_keys[threadIdx.x + (blockDim_x * i)] = my_keys[i];
+          smem_vals[threadIdx.x + (blockDim_x * i)] = my_vals[i];
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < numElementsPerThread; i++) {
+          float opp_key    = smem_keys[(threadIdx.x ^ curr_mask) + (blockDim_x * i)];
+          uint32_t opp_val = smem_vals[(threadIdx.x ^ curr_mask) + (blockDim_x * i)];
+          swap_if_needed<float, uint32_t>(my_keys[i], opp_key, my_vals[i], opp_val, ascending);
+        }
+      } else {
+// intra warp: read the partner lane's elements through shuffles
+#pragma unroll
+        for (int i = 0; i < numElementsPerThread; i++) {
+          float opp_key    = __shfl_xor_sync(0xffffffff, my_keys[i], curr_mask);
+          uint32_t opp_val = __shfl_xor_sync(0xffffffff, my_vals[i], curr_mask);
+          swap_if_needed<float, uint32_t>(my_keys[i], opp_key, my_vals[i], opp_val, ascending);
+        }
+      }
+    }
+
+    bool ascending = ((threadIdx.x & next_mask) == 0);  // direction of the thread-local pass
+#pragma unroll
+    for (uint32_t curr_mask = numElementsPerThread / 2; curr_mask > 0; curr_mask >>= 1) {
+#pragma unroll
+      for (int i = 0; i < numElementsPerThread; i++) {
+        int j = i ^ curr_mask;
+        if (i > j) continue;
+        swap_if_needed<float, uint32_t>(my_keys[i], my_keys[j], my_vals[i], my_vals[j], ascending);
+      }
+    }
+    mask = next_mask;
+  }
+
+  // Update knn_graph: write the neighbor ids back in their new order (padding slots are skipped)
+  for (int i = 0; i < numElementsPerThread; i++) {
+    int k = i + (numElementsPerThread * threadIdx.x);
+    if (k < graph_degree) {
+      knn_graph[srcNode_dev][k + ((uint64_t)graph_degree * srcNode_loc)] = my_vals[i];
+    }
+  }
+}
+
+template <int MAX_DEGREE>
+__global__ void kern_prune(
+  uint32_t** knn_graph,  // [num_gpus][graph_chunk_size, graph_degree]
+  uint32_t graph_size,
+  uint32_t graph_chunk_size,  // (*) num_gpus * graph_chunk_size >= graph_size
+  uint32_t graph_degree,
+  uint32_t degree,
+  int dev_id,
+  uint32_t batch_size,
+  uint32_t batch_id,
+  uint8_t** detour_count,          // [num_gpus][graph_chunk_size, graph_degree]
+  uint32_t** num_no_detour_edges,  // [num_gpus][graph_size]
+  uint64_t* stats)
+{
+  // One block per node A (row blockIdx.x + batch_size * batch_id of chunk dev_id). For every
+  // edge A->B it counts the 2-hop detours A->D->B through neighbors D listed before B, stores
+  // the counts capped at 255 in detour_count and adds to the two counters in `stats`.
+  __shared__ uint32_t smem_num_detour[MAX_DEGREE];
+
+  const uint64_t node_in_chunk = blockIdx.x + (batch_size * batch_id);
+  if (node_in_chunk >= graph_chunk_size) { return; }
+  for (uint32_t k = threadIdx.x; k < graph_degree; k += blockDim.x) {
+    smem_num_detour[k] = 0;
+  }
+  __syncthreads();
+
+  const uint64_t iA     = node_in_chunk + ((uint64_t)graph_chunk_size * dev_id);
+  const uint64_t iA_dev = iA / graph_chunk_size;
+  const uint64_t iA_loc = iA % graph_chunk_size;
+  if (iA >= graph_size) { return; }
+  const uint64_t row_A = graph_degree * iA_loc;  // offset of A's row inside its chunk
+
+  // count number of detours (A->D->B)
+  for (uint32_t kAD = 0; kAD < graph_degree - 1; kAD++) {
+    const uint64_t iD    = knn_graph[iA_dev][kAD + row_A];
+    const uint64_t row_D = (uint64_t)graph_degree * (iD % graph_chunk_size);
+    for (uint32_t kDB = threadIdx.x; kDB < graph_degree; kDB += blockDim.x) {
+      const uint64_t iB_candidate = knn_graph[iD / graph_chunk_size][kDB + row_D];
+      // Search D's neighbor among the edges of A that are listed after A->D.
+      for (uint32_t kAB = kAD + 1; kAB < graph_degree; kAB++) {
+        if (knn_graph[iA_dev][kAB + row_A] != iB_candidate) { continue; }
+        atomicAdd(smem_num_detour + kAB, 1);
+        break;
+      }
+    }
+    __syncthreads();
+  }
+
+  // Store the capped counts and count this thread's edges that have no detour.
+  uint32_t num_edges_no_detour = 0;
+  for (uint32_t k = threadIdx.x; k < graph_degree; k += blockDim.x) {
+    const uint32_t cnt              = smem_num_detour[k];
+    detour_count[iA_dev][k + row_A] = min(cnt, (uint32_t)255);
+    if (cnt == 0) { num_edges_no_detour++; }
+  }
+  // XOR-shuffle sum over 32 lanes.
+  // NOTE(review): only one warp is reduced; presumably launched with 32 threads per block.
+  for (uint32_t offset = 1; offset < 32; offset <<= 1) {
+    num_edges_no_detour += __shfl_xor_sync(0xffffffff, num_edges_no_detour, offset);
+  }
+  num_edges_no_detour = min(num_edges_no_detour, degree);
+
+  // stats[0] accumulates the clipped counts, stats[1] the nodes that reached `degree`.
+  if (threadIdx.x == 0) {
+    num_no_detour_edges[iA_dev][iA_loc] = num_edges_no_detour;
+    atomicAdd((unsigned long long int*)stats, (unsigned long long int)num_edges_no_detour);
+    if (num_edges_no_detour >= degree) { atomicAdd((unsigned long long int*)(stats + 1), 1); }
+  }
+}
+
+// unnamed namespace to avoid multiple definition error
+namespace {
+__global__ void kern_make_rev_graph(const uint32_t i_gpu,
+                                    const uint32_t* dest_nodes,  // [global_graph_size]
+                                    const uint32_t global_graph_size,
+                                    uint32_t* rev_graph,        // [graph_size, degree]
+                                    uint32_t* rev_graph_count,  // [graph_size]
+                                    const uint32_t graph_size,
+                                    const uint32_t degree)
+{
+  // For every edge src -> dest whose dest lies in this GPU's id range, append src to dest's
+  // reverse list. The counter keeps growing once a list holds `degree` entries.
+  const uint32_t first_tid   = threadIdx.x + (blockDim.x * blockIdx.x);
+  const uint32_t num_threads = blockDim.x * gridDim.x;
+  const uint32_t range_begin = graph_size * i_gpu;
+  const uint32_t range_end   = graph_size * (i_gpu + 1);
+  for (uint32_t src = first_tid; src < global_graph_size; src += num_threads) {
+    const uint32_t dest = dest_nodes[src];
+    if (dest < range_begin || dest >= range_end || dest >= global_graph_size) continue;
+    const uint32_t local_dest = dest - range_begin;
+    const uint32_t slot       = atomicAdd(rev_graph_count + local_dest, 1);
+    if (slot < degree) { rev_graph[slot + ((uint64_t)degree * local_dest)] = src; }
+  }
+}
+}  // namespace
+template <class T>
+T*** mgpu_alloc(int n_gpus, uint32_t chunk, uint32_t nelems)
+{
+  // Allocates one [chunk, nelems] buffer of T per GPU and returns a host table with n_gpus + 1
+  // entries: entry g < n_gpus is a copy, allocated on GPU g, of the list of per-GPU buffer
+  // pointers; entry n_gpus is the host-side list itself. Device 0 is current on return.
+  T** buffers               = static_cast<T**>(malloc(sizeof(T*) * n_gpus)); /* h1 */
+  const size_t buffer_bytes = sizeof(T) * chunk * nelems;
+  for (int g = 0; g < n_gpus; g++) {
+    RAFT_CUDA_TRY(cudaSetDevice(g));
+    RAFT_CUDA_TRY(cudaMalloc(&(buffers[g]), buffer_bytes)); /* d1 */
+  }
+
+  T*** table              = static_cast<T***>(malloc(sizeof(T**) * (n_gpus + 1))); /* h2 */
+  const size_t list_bytes = sizeof(T*) * n_gpus;
+  for (int g = 0; g < n_gpus; g++) {
+    RAFT_CUDA_TRY(cudaSetDevice(g));
+    RAFT_CUDA_TRY(cudaMalloc(&(table[g]), list_bytes)); /* d2 */
+    RAFT_CUDA_TRY(cudaMemcpy(table[g], buffers, list_bytes, cudaMemcpyDefault));
+  }
+  RAFT_CUDA_TRY(cudaSetDevice(0));
+  table[n_gpus] = buffers;
+  return table;
+}
+
+template <class T>
+void mgpu_free(T*** d_arrays, int n_gpus)  // releases a table with the layout mgpu_alloc builds
+{
+  for (int i_gpu = 0; i_gpu < n_gpus; i_gpu++) {
+    RAFT_CUDA_TRY(cudaSetDevice(i_gpu));
+    RAFT_CUDA_TRY(cudaFree(d_arrays[n_gpus][i_gpu])); /* d1: the data buffer of this GPU */
+    RAFT_CUDA_TRY(cudaFree(d_arrays[i_gpu]));         /* d2: this GPU's copy of the pointers */
+  }
+  RAFT_CUDA_TRY(cudaSetDevice(0));  // device 0 is current on return
+  free(d_arrays[n_gpus]); /* h1: host list of the data buffers */
+  free(d_arrays);         /* h2: the table itself */
+}
+
+template <class T>
+void mgpu_H2D(T*** d_arrays,     // [n_gpus+1][n_gpus][chunk, nelems]
+              const T* h_array,  // [size, nelems]
+              int n_gpus,
+              uint32_t size,
+              uint32_t chunk,  // (*) n_gpus * chunk >= size
+              uint32_t nelems)
+{
+  // Scatter rows: GPU g receives rows [g * chunk, (g + 1) * chunk) of h_array, clipped to `size`.
+  // One loop iteration per GPU, so every GPU is served whatever team size OpenMP grants.
+#pragma omp parallel for num_threads(n_gpus)
+  for (int i_gpu = 0; i_gpu < n_gpus; i_gpu++) {
+    const uint64_t first_row = (uint64_t)chunk * i_gpu;
+    if (first_row >= size) { continue; }  // no rows for this GPU; also avoids unsigned wrap
+    RAFT_CUDA_TRY(cudaSetDevice(i_gpu));
+    const size_t bsize = sizeof(T) * std::min<uint64_t>(size - first_row, chunk) * nelems;
+    RAFT_CUDA_TRY(cudaMemcpy(
+      d_arrays[n_gpus][i_gpu], h_array + (first_row * nelems), bsize, cudaMemcpyDefault));
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  }
+  RAFT_CUDA_TRY(cudaSetDevice(0));
+}
+
+template <class T>
+void mgpu_D2H(T*** d_arrays,  // [n_gpus+1][n_gpus][chunk, nelems]
+              T* h_array,     // [size, nelems]
+              int n_gpus,
+              uint32_t size,
+              uint32_t chunk,  // (*) n_gpus * chunk >= size
+              uint32_t nelems)
+{
+  // Gather rows: rows [g * chunk, (g + 1) * chunk) of h_array, clipped to `size`, come from GPU g.
+  // One loop iteration per GPU, so every GPU is served whatever team size OpenMP grants.
+#pragma omp parallel for num_threads(n_gpus)
+  for (int i_gpu = 0; i_gpu < n_gpus; i_gpu++) {
+    const uint64_t first_row = (uint64_t)chunk * i_gpu;
+    if (first_row >= size) { continue; }  // no rows on this GPU; also avoids unsigned wrap
+    RAFT_CUDA_TRY(cudaSetDevice(i_gpu));
+    const size_t bsize = sizeof(T) * std::min<uint64_t>(size - first_row, chunk) * nelems;
+    RAFT_CUDA_TRY(cudaMemcpy(
+      h_array + (first_row * nelems), d_arrays[n_gpus][i_gpu], bsize, cudaMemcpyDefault));
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  }
+  RAFT_CUDA_TRY(cudaSetDevice(0));
+}
+
+template <class T>
+uint64_t pos_in_array(T val, const T* array, uint64_t num)
+{
+  // Linear search: index of the first element equal to `val`, or `num` when there is none.
+  uint64_t idx = 0;
+  while (idx < num && !(val == array[idx])) { ++idx; }
+  return idx;
+}
+
+template <class T>
+void shift_array(T* array, uint64_t num)
+{
+  // Moves array[0 .. num-1] one slot up, starting from the top: array[num] is overwritten,
+  // array[0] keeps its value. The caller has to provide num + 1 elements.
+  for (T* dst = array + num; dst != array; --dst) { *dst = *(dst - 1); }
+}
+
+/**
+ * Prune a kNN graph to `new_graph.extent(1)` neighbors per node and mix in reverse edges.
+ *
+ * Steps: (1) sort each neighbor list of `knn_graph` by distance on the GPUs (the sorted lists
+ * are written back into `knn_graph`), (2) count the 2-hop detours of every edge and keep, per
+ * node, the edges with the fewest detours, (3) build the reverse graph of the kept edges and
+ * (4) insert reverse edges behind the first `output_degree / 2` (protected) entries.
+ * All visible CUDA devices are used and peer access is enabled between them.
+ *
+ * @param res       RAFT resources handle (not referenced by this implementation)
+ * @param dataset   [n_rows, dim] vectors; copied with cudaMemcpyDefault
+ * @param knn_graph [n_rows, input_degree] neighbor ids, input_degree <= 1024; it is also
+ *                  dereferenced on the CPU, so it has to be host-accessible
+ * @param new_graph [n_rows, output_degree] host output, output_degree <= input_degree
+ */
+template <class DATA_T,
+          typename IdxT = uint32_t,
+          typename d_accessor =
+            host_device_accessor<std::experimental::default_accessor<DATA_T>, memory_type::device>,
+          typename g_accessor =
+            host_device_accessor<std::experimental::default_accessor<IdxT>, memory_type::host>>
+void prune(raft::device_resources const& res,
+           mdspan<const DATA_T, matrix_extent<IdxT>, row_major, d_accessor> dataset,
+           mdspan<IdxT, matrix_extent<IdxT>, row_major, g_accessor> knn_graph,
+           raft::host_matrix_view<IdxT, IdxT, row_major> new_graph)
+{
+  RAFT_LOG_DEBUG("# Pruning kNN graph (size=%zu, degree=%zu)\n",
+                 static_cast<size_t>(knn_graph.extent(0)),
+                 static_cast<size_t>(knn_graph.extent(1)));
+
+  RAFT_EXPECTS(
+    dataset.extent(0) == knn_graph.extent(0) && knn_graph.extent(0) == new_graph.extent(0),
+    "Each input array is expected to have the same number of rows");
+  RAFT_EXPECTS(new_graph.extent(1) <= knn_graph.extent(1),
+               "output graph cannot have more columns than input graph");
+  const uint32_t dataset_size        = dataset.extent(0);
+  const uint32_t dataset_dim         = dataset.extent(1);
+  const uint32_t input_graph_degree  = knn_graph.extent(1);
+  const uint32_t output_graph_degree = new_graph.extent(1);
+  const DATA_T* dataset_ptr          = dataset.data_handle();
+  uint32_t* input_graph_ptr          = (uint32_t*)knn_graph.data_handle();
+  uint32_t* output_graph_ptr         = new_graph.data_handle();
+  float scale                  = 1.0f / raft::spatial::knn::detail::utils::config<DATA_T>::kDivisor;
+  const std::size_t graph_size = dataset_size;
+
+  // The kernels below handle at most 1024 neighbors per node and need a non-empty graph.
+  // Report violations as exceptions, before anything is allocated.
+  constexpr int numElementsPerThread = 4;
+  constexpr int max_input_degree     = 1024;
+  static_assert(numElementsPerThread * 256 == max_input_degree, "kern_sort dispatch limit");
+  RAFT_EXPECTS(dataset_size > 0, "dataset must not be empty");
+  RAFT_EXPECTS(input_graph_degree <= max_input_degree,
+               "The degree of input knn graph is too large (%u). "
+               "It must be equal to or smaller than %d.",
+               input_graph_degree,
+               max_input_degree);
+
+  // Setup GPUs
+  int num_gpus = 0;
+  RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus));
+  RAFT_LOG_DEBUG("# num_gpus: %d\n", num_gpus);
+  for (int self = 0; self < num_gpus; self++) {
+    RAFT_CUDA_TRY(cudaSetDevice(self));
+    for (int peer = 0; peer < num_gpus; peer++) {
+      if (self == peer) { continue; }
+      // Nothing in this function disables peer access again, so a later call finds it
+      // "already enabled"; treat that as success and clear the recorded error.
+      const cudaError_t peer_status = cudaDeviceEnablePeerAccess(peer, 0);
+      if (peer_status == cudaErrorPeerAccessAlreadyEnabled) {
+        (void)cudaGetLastError();
+      } else {
+        RAFT_CUDA_TRY(peer_status);
+      }
+    }
+  }
+  RAFT_CUDA_TRY(cudaSetDevice(0));
+
+  // [...][num_gpus][graph_chunk_size, input_graph_degree]
+  const uint32_t graph_chunk_size = (graph_size + num_gpus - 1) / num_gpus;
+  uint32_t*** d_input_graph_ptr =
+    mgpu_alloc<uint32_t>(num_gpus, graph_chunk_size, input_graph_degree);
+
+  // [num_gpus+1][...][...]
+  const uint32_t dataset_chunk_size = (dataset_size + num_gpus - 1) / num_gpus;
+  assert(dataset_chunk_size == graph_chunk_size);
+  DATA_T*** d_dataset_ptr = mgpu_alloc<DATA_T>(num_gpus, dataset_chunk_size, dataset_dim);
+
+  mgpu_H2D<DATA_T>(
+    d_dataset_ptr, dataset_ptr, num_gpus, dataset_size, dataset_chunk_size, dataset_dim);
+
+  //
+  // Sorting kNN graph
+  //
+  double time_sort_start = cur_time();
+  RAFT_LOG_DEBUG("# Sorting kNN Graph on GPUs ");
+  mgpu_H2D<uint32_t>(
+    d_input_graph_ptr, input_graph_ptr, num_gpus, graph_size, graph_chunk_size, input_graph_degree);
+  void (*kernel_sort)(
+    DATA_T**, uint32_t, uint32_t, uint32_t, float, uint32_t**, uint32_t, uint32_t, uint32_t, int);
+  dim3 threads_sort(1, 1, 1);
+  if (input_graph_degree <= numElementsPerThread * 32) {
+    constexpr int blockDim_x = 32;
+    kernel_sort              = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
+    threads_sort.x           = blockDim_x;
+  } else if (input_graph_degree <= numElementsPerThread * 64) {
+    constexpr int blockDim_x = 64;
+    kernel_sort              = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
+    threads_sort.x           = blockDim_x;
+  } else if (input_graph_degree <= numElementsPerThread * 128) {
+    constexpr int blockDim_x = 128;
+    kernel_sort              = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
+    threads_sort.x           = blockDim_x;
+  } else {
+    // Degrees up to numElementsPerThread * 256; larger ones were rejected above.
+    constexpr int blockDim_x = 256;
+    kernel_sort              = kern_sort<DATA_T, blockDim_x, numElementsPerThread>;
+    threads_sort.x           = blockDim_x;
+  }
+  dim3 blocks_sort(graph_chunk_size, 1, 1);
+  for (int i_gpu = 0; i_gpu < num_gpus; i_gpu++) {
+    RAFT_LOG_DEBUG(".");
+    RAFT_CUDA_TRY(cudaSetDevice(i_gpu));
+    kernel_sort<<<blocks_sort, threads_sort>>>(d_dataset_ptr[i_gpu],
+                                               dataset_size,
+                                               dataset_chunk_size,
+                                               dataset_dim,
+                                               scale,
+                                               d_input_graph_ptr[i_gpu],
+                                               graph_size,
+                                               graph_chunk_size,
+                                               input_graph_degree,
+                                               i_gpu);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+  RAFT_CUDA_TRY(cudaSetDevice(0));
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  RAFT_LOG_DEBUG(".");
+  mgpu_D2H<uint32_t>(
+    d_input_graph_ptr, input_graph_ptr, num_gpus, graph_size, graph_chunk_size, input_graph_degree);
+  RAFT_LOG_DEBUG("\n");
+  double time_sort_end = cur_time();
+  RAFT_LOG_DEBUG("# Sorting kNN graph time: %.1lf sec\n", time_sort_end - time_sort_start);
+
+  mgpu_free<DATA_T>(d_dataset_ptr, num_gpus);
+
+  // Per-edge detour counters, [graph_size, input_graph_degree], preset to 0xff.
+  size_t array_size = sizeof(uint8_t) * graph_size * input_graph_degree;
+  std::unique_ptr<uint8_t[]> detour_count(new uint8_t[array_size]);
+  memset(detour_count.get(), 0xff, array_size);
+
+  // [...][num_gpus][graph_chunk_size, input_graph_degree]
+  uint8_t*** d_detour_count = mgpu_alloc<uint8_t>(num_gpus, graph_chunk_size, input_graph_degree);
+  mgpu_H2D<uint8_t>(
+    d_detour_count, detour_count.get(), num_gpus, graph_size, graph_chunk_size, input_graph_degree);
+
+  // Detour-free edge count per node, [graph_size], zero-initialized.
+  auto num_no_detour_edges = std::make_unique<uint32_t[]>(graph_size);
+
+  // [...][num_gpus][graph_chunk_size]
+  uint32_t*** d_num_no_detour_edges = mgpu_alloc<uint32_t>(num_gpus, graph_chunk_size, 1);
+  mgpu_H2D<uint32_t>(
+    d_num_no_detour_edges, num_no_detour_edges.get(), num_gpus, graph_size, graph_chunk_size, 1);
+
+  // Per-GPU counters {retained edges, nodes that retained `output_graph_degree` edges}.
+  auto dev_stats = std::make_unique<uint64_t*[]>(num_gpus);  // [num_gpus][2]
+  for (int i_gpu = 0; i_gpu < num_gpus; i_gpu++) {
+    RAFT_CUDA_TRY(cudaSetDevice(i_gpu));
+    RAFT_CUDA_TRY(cudaMalloc(&(dev_stats[i_gpu]), sizeof(uint64_t) * 2));
+    RAFT_CUDA_TRY(cudaMemset(dev_stats[i_gpu], 0, sizeof(uint64_t) * 2));
+  }
+  RAFT_CUDA_TRY(cudaSetDevice(0));
+
+  //
+  // Prune unimportant edges.
+  //
+  // The edge to be retained is determined without explicitly considering
+  // distance or angle. Suppose the edge is the k-th edge of some node-A to
+  // node-B (A->B). Among the edges originating at node-A, there are k-1 edges
+  // shorter than the edge A->B. Each of these k-1 edges are connected to a
+  // different k-1 nodes. Among these k-1 nodes, count the number of nodes with
+  // edges to node-B, which is the number of 2-hop detours for the edge A->B.
+  // Once the number of 2-hop detours has been counted for all edges, the
+  // specified number of edges are picked up for each node, starting with the
+  // edge with the lowest number of 2-hop detours.
+  //
+  double time_prune_start = cur_time();
+  uint64_t num_keep       = 0;
+  uint64_t num_full       = 0;
+  RAFT_LOG_DEBUG("# Pruning kNN Graph on GPUs\r");
+  mgpu_H2D<uint32_t>(
+    d_input_graph_ptr, input_graph_ptr, num_gpus, graph_size, graph_chunk_size, input_graph_degree);
+  auto kernel_prune   = kern_prune<max_input_degree>;
+  uint32_t batch_size = std::min(graph_chunk_size, (uint32_t)256 * 1024);
+  uint32_t num_batch  = (graph_chunk_size + batch_size - 1) / batch_size;
+  dim3 threads_prune(32, 1, 1);
+  dim3 blocks_prune(batch_size, 1, 1);
+  for (uint32_t i_batch = 0; i_batch < num_batch; i_batch++) {
+    for (int i_gpu = 0; i_gpu < num_gpus; i_gpu++) {
+      RAFT_CUDA_TRY(cudaSetDevice(i_gpu));
+      kernel_prune<<<blocks_prune, threads_prune>>>(d_input_graph_ptr[i_gpu],
+                                                    graph_size,
+                                                    graph_chunk_size,
+                                                    input_graph_degree,
+                                                    output_graph_degree,
+                                                    i_gpu,
+                                                    batch_size,
+                                                    i_batch,
+                                                    d_detour_count[i_gpu],
+                                                    d_num_no_detour_edges[i_gpu],
+                                                    dev_stats[i_gpu]);
+      RAFT_CUDA_TRY(cudaPeekAtLastError());
+    }
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    RAFT_LOG_DEBUG(
+      "# Pruning kNN Graph on GPUs (%.1lf %%)\r",
+      (double)std::min((i_batch + 1) * batch_size, graph_chunk_size) / graph_chunk_size * 100);
+  }
+  for (int i_gpu = 0; i_gpu < num_gpus; i_gpu++) {
+    uint64_t host_stats[2] = {0, 0};
+    RAFT_CUDA_TRY(cudaSetDevice(i_gpu));
+    RAFT_CUDA_TRY(cudaMemcpy(host_stats, dev_stats[i_gpu], sizeof(host_stats), cudaMemcpyDefault));
+    RAFT_CUDA_TRY(cudaFree(dev_stats[i_gpu]));
+    num_keep += host_stats[0];
+    num_full += host_stats[1];
+  }
+  RAFT_CUDA_TRY(cudaDeviceSynchronize());
+  RAFT_CUDA_TRY(cudaSetDevice(0));
+  RAFT_LOG_DEBUG("\n");
+
+  mgpu_D2H<uint8_t>(
+    d_detour_count, detour_count.get(), num_gpus, graph_size, graph_chunk_size, input_graph_degree);
+  mgpu_D2H<uint32_t>(
+    d_num_no_detour_edges, num_no_detour_edges.get(), num_gpus, graph_size, graph_chunk_size, 1);
+
+  mgpu_free<uint32_t>(d_input_graph_ptr, num_gpus);
+  mgpu_free<uint8_t>(d_detour_count, num_gpus);
+  mgpu_free<uint32_t>(d_num_no_detour_edges, num_gpus);
+
+  // Create pruned kNN graph
+  std::unique_ptr<uint32_t[]> pruned_graph_ptr(new uint32_t[graph_size * output_graph_degree]);
+  uint32_t max_detour = 0;
+#pragma omp parallel for reduction(max : max_detour)
+  for (uint64_t i = 0; i < graph_size; i++) {
+    uint64_t pk = 0;
+    for (uint32_t num_detour = 0; num_detour < output_graph_degree; num_detour++) {
+      if (max_detour < num_detour) { max_detour = num_detour; /* stats */ }
+      for (uint64_t k = 0; k < input_graph_degree; k++) {
+        if (detour_count[k + (input_graph_degree * i)] != num_detour) { continue; }
+        pruned_graph_ptr[pk + (output_graph_degree * i)] =
+          input_graph_ptr[k + (input_graph_degree * i)];
+        pk += 1;
+        if (pk >= output_graph_degree) break;
+      }
+      if (pk >= output_graph_degree) break;
+    }
+    assert(pk == output_graph_degree);
+  }
+  RAFT_LOG_DEBUG("# max_detour: %u\n", max_detour);
+
+  double time_prune_end = cur_time();
+  RAFT_LOG_DEBUG(
+    "# Pruning time: %.1lf sec, "
+    "avg_no_detour_edges_per_node: %.2lf/%u, "
+    "nodes_with_no_detour_at_all_edges: %.1lf%%\n",
+    time_prune_end - time_prune_start,
+    (double)num_keep / graph_size,
+    output_graph_degree,
+    (double)num_full / graph_size * 100);
+
+  //
+  // Make reverse graph
+  //
+  double time_make_start = cur_time();
+
+  // Reverse edge lists, [graph_size, output_graph_degree], preset to 0xffffffff.
+  array_size = sizeof(uint32_t) * graph_size * output_graph_degree;
+  std::unique_ptr<uint32_t[]> rev_graph(new uint32_t[graph_size * output_graph_degree]);
+  memset(rev_graph.get(), 0xff, array_size);
+
+  // [...][num_gpus][graph_chunk_size, output_graph_degree]
+  uint32_t*** d_rev_graph_ptr =
+    mgpu_alloc<uint32_t>(num_gpus, graph_chunk_size, output_graph_degree);
+  mgpu_H2D<uint32_t>(
+    d_rev_graph_ptr, rev_graph.get(), num_gpus, graph_size, graph_chunk_size, output_graph_degree);
+
+  // Incoming-edge counters, [graph_size], zero-initialized. The kernel keeps counting after a
+  // list is full, so a counter can exceed output_graph_degree.
+  auto rev_graph_count = std::make_unique<uint32_t[]>(graph_size);
+
+  // [...][num_gpus][graph_chunk_size, 1]
+  uint32_t*** d_rev_graph_count = mgpu_alloc<uint32_t>(num_gpus, graph_chunk_size, 1);
+  mgpu_H2D<uint32_t>(
+    d_rev_graph_count, rev_graph_count.get(), num_gpus, graph_size, graph_chunk_size, 1);
+
+  std::unique_ptr<uint32_t[]> dest_nodes(new uint32_t[graph_size]);  // [graph_size]
+  auto d_dest_nodes = std::make_unique<uint32_t*[]>(num_gpus);       // [num_gpus][graph_size]
+  for (int i_gpu = 0; i_gpu < num_gpus; i_gpu++) {
+    RAFT_CUDA_TRY(cudaSetDevice(i_gpu));
+    RAFT_CUDA_TRY(cudaMalloc(&(d_dest_nodes[i_gpu]), sizeof(uint32_t) * graph_size));
+  }
+
+  for (uint64_t k = 0; k < output_graph_degree; k++) {
+#pragma omp parallel for
+    for (uint64_t i = 0; i < graph_size; i++) {
+      dest_nodes[i] = pruned_graph_ptr[k + (output_graph_degree * i)];
+    }
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    // One loop iteration per GPU, so every GPU is served whatever team size OpenMP grants.
+#pragma omp parallel for num_threads(num_gpus)
+    for (int i_gpu = 0; i_gpu < num_gpus; i_gpu++) {
+      RAFT_CUDA_TRY(cudaSetDevice(i_gpu));
+      RAFT_CUDA_TRY(cudaMemcpy(d_dest_nodes[i_gpu],
+                               dest_nodes.get(),
+                               sizeof(uint32_t) * graph_size,
+                               cudaMemcpyHostToDevice));
+      dim3 threads(256, 1, 1);
+      dim3 blocks(1024, 1, 1);
+      kern_make_rev_graph<<<blocks, threads>>>(i_gpu,
+                                               d_dest_nodes[i_gpu],
+                                               graph_size,
+                                               d_rev_graph_ptr[num_gpus][i_gpu],
+                                               d_rev_graph_count[num_gpus][i_gpu],
+                                               graph_chunk_size,
+                                               output_graph_degree);
+      RAFT_CUDA_TRY(cudaPeekAtLastError());
+    }
+    RAFT_LOG_DEBUG("# Making reverse graph on GPUs: %lu / %u    \r", k, output_graph_degree);
+  }
+  for (int i_gpu = 0; i_gpu < num_gpus; i_gpu++) {
+    RAFT_CUDA_TRY(cudaSetDevice(i_gpu));
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    RAFT_CUDA_TRY(cudaFree(d_dest_nodes[i_gpu]));
+  }
+  RAFT_CUDA_TRY(cudaSetDevice(0));
+  RAFT_LOG_DEBUG("\n");
+
+  mgpu_D2H<uint32_t>(
+    d_rev_graph_ptr, rev_graph.get(), num_gpus, graph_size, graph_chunk_size, output_graph_degree);
+  mgpu_D2H<uint32_t>(
+    d_rev_graph_count, rev_graph_count.get(), num_gpus, graph_size, graph_chunk_size, 1);
+  mgpu_free<uint32_t>(d_rev_graph_ptr, num_gpus);
+  mgpu_free<uint32_t>(d_rev_graph_count, num_gpus);
+
+  double time_make_end = cur_time();
+  RAFT_LOG_DEBUG("# Making reverse graph time: %.1lf sec", time_make_end - time_make_start);
+
+  //
+  // Replace some edges with reverse edges
+  //
+  double time_replace_start = cur_time();
+
+  uint64_t num_protected_edges = output_graph_degree / 2;
+  RAFT_LOG_DEBUG("# num_protected_edges: %lu", num_protected_edges);
+
+  array_size = sizeof(uint32_t) * graph_size * output_graph_degree;
+  memcpy(output_graph_ptr, pruned_graph_ptr.get(), array_size);
+
+  constexpr int _omp_chunk = 1024;
+#pragma omp parallel for schedule(dynamic, _omp_chunk)
+  for (uint64_t j = 0; j < graph_size; j++) {
+    // Only the first output_graph_degree reverse edges of a node were stored; clamp the
+    // counter so that the scan stays inside row j of rev_graph.
+    const uint64_t num_rev = std::min<uint64_t>(rev_graph_count[j], output_graph_degree);
+    for (uint64_t _k = 0; _k < num_rev; _k++) {
+      uint64_t k = num_rev - 1 - _k;
+      uint64_t i = rev_graph[k + (output_graph_degree * j)];
+      uint64_t pos = pos_in_array<uint32_t>(
+        i, output_graph_ptr + (output_graph_degree * j), output_graph_degree);
+      if (pos < num_protected_edges) { continue; }
+      uint64_t num_shift = pos - num_protected_edges;
+      if (pos == output_graph_degree) { num_shift = output_graph_degree - num_protected_edges - 1; }
+      shift_array<uint32_t>(output_graph_ptr + num_protected_edges + (output_graph_degree * j),
+                            num_shift);
+      output_graph_ptr[num_protected_edges + (output_graph_degree * j)] = i;
+    }
+    if ((omp_get_thread_num() == 0) && ((j % _omp_chunk) == 0)) {
+      RAFT_LOG_DEBUG("# Replacing reverse edges: %lu / %lu    ", j, graph_size);
+    }
+  }
+  RAFT_LOG_DEBUG("\n");
+
+  double time_replace_end = cur_time();
+  RAFT_LOG_DEBUG("# Replacing edges time: %.1lf sec", time_replace_end - time_replace_start);
+
+  /* stats */
+  uint64_t num_replaced_edges = 0;
+#pragma omp parallel for reduction(+ : num_replaced_edges)
+  for (uint64_t i = 0; i < graph_size; i++) {
+    for (uint64_t k = 0; k < output_graph_degree; k++) {
+      uint64_t j   = pruned_graph_ptr[k + (output_graph_degree * i)];
+      uint64_t pos = pos_in_array<uint32_t>(
+        j, output_graph_ptr + (output_graph_degree * i), output_graph_degree);
+      if (pos == output_graph_degree) { num_replaced_edges += 1; }
+    }
+  }
+  RAFT_LOG_DEBUG("# Average number of replaced edges per node: %.2f",
+                 (double)num_replaced_edges / graph_size);
+}
+
+}  // namespace graph
+}  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp b/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp
new file mode 100644
index 0000000000..18f4006367
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/hashmap.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "utils.hpp"
+#include <cstdint>
+#include <raft/core/detail/macros.hpp>
+
+// #pragma GCC diagnostic push
+// #pragma GCC diagnostic ignored
+// #pragma GCC diagnostic pop
+namespace raft::neighbors::experimental::cagra::detail {
+namespace hashmap {
+
+_RAFT_HOST_DEVICE inline uint32_t get_size(const uint32_t bitlen) { return static_cast<uint32_t>(1) << bitlen; }  // number of table slots: 2^bitlen
+
+template <unsigned FIRST_TID = 0>  // threads with threadIdx.x >= FIRST_TID reset all 2^bitlen slots
+_RAFT_DEVICE inline void init(uint32_t* table, const uint32_t bitlen)
+{
+  if (threadIdx.x < FIRST_TID) { return; }  // threads below FIRST_TID do not take part
+  for (unsigned slot = threadIdx.x - FIRST_TID; slot < get_size(bitlen); slot += blockDim.x - FIRST_TID) {
+    table[slot] = utils::get_max_value<uint32_t>();  // value every slot holds after a reset
+  }
+}
+
+template <unsigned FIRST_TID, unsigned LAST_TID>  // only threads in [FIRST_TID, LAST_TID) reset the table
+_RAFT_DEVICE inline void init(uint32_t* table, const uint32_t bitlen)
+{
+  if (threadIdx.x >= LAST_TID || (FIRST_TID > 0 && threadIdx.x < FIRST_TID)) { return; }
+  for (unsigned slot = threadIdx.x - FIRST_TID; slot < get_size(bitlen); slot += LAST_TID - FIRST_TID) {
+    table[slot] = utils::get_max_value<uint32_t>();  // value every slot holds after a reset
+  }
+}
+
+// Insert `key` into an open-addressing hash table of 2^bitlen slots using linear probing.
+//
+// Returns 1 when this call claimed an empty slot for `key`.
+// Returns 0 when `key` was already stored, and also when all slots were probed without
+// finding a free one (a full table is not reported separately).
+//
+// ~0u marks an empty slot, so a key equal to ~0u cannot be distinguished from "empty".
+// NOTE(review): init() fills the table with utils::get_max_value<uint32_t>(); this is
+// presumably the same bit pattern as ~0u - confirm in utils.hpp.
+_RAFT_DEVICE inline uint32_t insert(uint32_t* table, const uint32_t bitlen, const uint32_t key)
+{
+  constexpr uint32_t empty_slot = ~0u;
+  const uint32_t size           = get_size(bitlen);
+  const uint32_t bit_mask       = size - 1;
+  // Start slot: XOR the bits above `bitlen` into the low bits, then mask to the table size.
+  uint32_t index = (key ^ (key >> bitlen)) & bit_mask;
+  for (uint32_t probe = 0; probe < size; probe++) {
+    // Atomically store `key` only if the slot is still empty; `old` is the previous content.
+    const uint32_t old = atomicCAS(&table[index], empty_slot, key);
+    if (old == empty_slot) { return 1; }  // slot was free: key newly inserted
+    if (old == key) { return 0; }         // key already present
+    index = (index + 1) & bit_mask;       // occupied by another key: try the next slot
+  }
+  return 0;  // every slot probed, none free
+}
+
+template <unsigned TEAM_SIZE>
+_RAFT_DEVICE inline uint32_t insert(uint32_t* table, const uint32_t bitlen, const uint32_t key)
+{
+  // Only threads with threadIdx.x % TEAM_SIZE == 0 probe the table; the others contribute 0.
+  uint32_t inserted = (threadIdx.x % TEAM_SIZE == 0) ? insert(table, bitlen, key) : 0u;
+  for (unsigned lane_mask = 1; lane_mask < TEAM_SIZE; lane_mask <<= 1) {
+    inserted |= __shfl_xor_sync(0xffffffff, inserted, lane_mask);  // OR-combine with partner lane
+  }
+  return inserted;
+}
+
+}  // namespace hashmap
+}  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh
new file mode 100644
index 0000000000..2c0ac98417
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh
@@ -0,0 +1,632 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <memory>
+#include <numeric>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+
+#include <vector>
+
+#include "bitonic.hpp"
+#include "compute_distance.hpp"
+#include "device_common.hpp"
+#include "hashmap.hpp"
+#include "search_plan.cuh"
+#include "topk_for_cagra/topk_core.cuh"  // TODO replace with raft topk if possible
+#include "utils.hpp"
+#include <raft/core/logger.hpp>
+#include <raft/util/cuda_rt_essentials.hpp>
+#include <raft/util/cudart_utils.hpp>  // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
+
+namespace raft::neighbors::experimental::cagra::detail {
+namespace multi_cta_search {
+
+// #define _CLK_BREAKDOWN
+
+// Warp 0 copies up to `num_parents` entries of `itopk_indices` whose MSB is clear into
+// `next_parent_indices` and sets their MSB (used); *terminate_flag = 1 if none was found.
+template <class INDEX_T>
+__device__ void pickup_next_parents(INDEX_T* const next_parent_indices,  // [num_parents]
+                                    const uint32_t num_parents,
+                                    INDEX_T* const itopk_indices,  // [num_itopk]
+                                    const size_t num_itopk,
+                                    uint32_t* const terminate_flag)
+{
+  if (threadIdx.x / 32 > 0) { return; }
+  const unsigned lane_id = threadIdx.x % 32;
+  for (uint32_t i = lane_id; i < num_parents; i += 32) {
+    next_parent_indices[i] = utils::get_max_value<INDEX_T>();
+  }
+  uint32_t max_itopk = num_itopk;
+  if (max_itopk % 32) { max_itopk += 32 - (max_itopk % 32); }  // all lanes reach each ballot
+  uint32_t num_new_parents = 0;
+  for (uint32_t j = lane_id; j < max_itopk; j += 32) {
+    INDEX_T index  = 0;  // only meaningful when new_parent != 0
+    int new_parent = 0;
+    if (j < num_itopk) {
+      index      = itopk_indices[j];
+      new_parent = ((index & 0x80000000) == 0);  // MSB clear: not yet used as a parent
+    }
+    const uint32_t ballot_mask = __ballot_sync(0xffffffff, new_parent);
+    if (new_parent) {
+      // Slot = candidates in lower lanes + earlier rounds; 1u keeps the shift defined at lane 31.
+      const uint32_t i = __popc(ballot_mask & ((1u << lane_id) - 1)) + num_new_parents;
+      if (i < num_parents) {
+        next_parent_indices[i] = index;
+        itopk_indices[j] |= 0x80000000;  // set most significant bit as used node
+      }
+    }
+    num_new_parents += __popc(ballot_mask);
+    if (num_new_parents >= num_parents) { break; }
+  }
+  if (threadIdx.x == 0 && (num_new_parents == 0)) { *terminate_flag = 1; }
+}
+
+// Warp 0 loads up to 32 * ceil(MAX_ELEMENTS / 32) (distance, index) pairs into per-thread
+// arrays, runs bitonic::warp_sort on them and writes the first `num_itopk` pairs of the
+// result back to the front of `distances` / `indices`. Other warps return immediately.
+template <unsigned MAX_ELEMENTS>
+__device__ inline void topk_by_bitonic_sort(float* distances,   // [num_elements]
+                                            uint32_t* indices,  // [num_elements]
+                                            const uint32_t num_elements,
+                                            const uint32_t num_itopk  // num_itopk <= num_elements
+)
+{
+  if (threadIdx.x / 32 > 0) { return; }
+  const unsigned lane               = threadIdx.x % 32;
+  constexpr unsigned ITEMS_PER_LANE = (MAX_ELEMENTS + 31) / 32;
+  float dist_local[ITEMS_PER_LANE];
+  uint32_t idx_local[ITEMS_PER_LANE];
+  // Load: input element (lane + 32 * r) goes to slot r of this lane; positions at or past
+  // `num_elements` are padded with the max float / max uint32_t values.
+  for (unsigned r = 0; r < ITEMS_PER_LANE; r++) {
+    const unsigned src  = lane + (32 * r);
+    const bool in_range = src < num_elements;
+    dist_local[r]       = in_range ? distances[src] : utils::get_max_value<float>();
+    idx_local[r]        = in_range ? indices[src] : utils::get_max_value<uint32_t>();
+  }
+  // NOTE(review): ordering is defined by bitonic::warp_sort (presumably ascending distance).
+  bitonic::warp_sort<float, uint32_t, ITEMS_PER_LANE>(dist_local, idx_local);
+  // Store: slot r of a lane is written to output position ITEMS_PER_LANE * lane + r.
+  for (unsigned r = 0; r < ITEMS_PER_LANE; r++) {
+    const unsigned dst = (ITEMS_PER_LANE * lane) + r;
+    if (dst < num_itopk) {
+      distances[dst] = dist_local[r];
+      indices[dst]   = idx_local[r];
+    }
+  }
+}
+
+// search_kernel: one thread block (CTA) per (cta_id = blockIdx.x, query_id = blockIdx.y);
+// the gridDim.x CTAs of a query (multiple CTAs per single query) share that query's visited
+// hash table, and each CTA writes its own itopk_size results (no merge inside this kernel).
+template <unsigned TEAM_SIZE,
+          unsigned BLOCK_SIZE,
+          unsigned BLOCK_COUNT,
+          unsigned MAX_ELEMENTS,
+          unsigned MAX_DATASET_DIM,
+          class DATA_T,
+          class DISTANCE_T,
+          class INDEX_T,
+          class LOAD_T>
+__launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__ void search_kernel(
+  INDEX_T* const result_indices_ptr,       // [num_queries, num_cta_per_query, itopk_size]
+  DISTANCE_T* const result_distances_ptr,  // [num_queries, num_cta_per_query, itopk_size]
+  const DATA_T* const dataset_ptr,         // [dataset_size, dataset_dim]
+  const size_t dataset_dim,
+  const size_t dataset_size,
+  const DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
+  const INDEX_T* const knn_graph,   // [dataset_size, graph_degree]
+  const uint32_t graph_degree,
+  const unsigned num_distilation,
+  const uint64_t rand_xor_mask,
+  const INDEX_T* seed_ptr,  // [num_queries, num_seeds]
+  const uint32_t num_seeds,
+  uint32_t* const visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
+  const uint32_t hash_bitlen,
+  const uint32_t itopk_size,
+  const uint32_t num_parents,
+  const uint32_t min_iteration,
+  const uint32_t max_iteration,
+  uint32_t* const num_executed_iterations /* stats */
+)
+{
+  assert(blockDim.x == BLOCK_SIZE);
+  assert(dataset_dim <= MAX_DATASET_DIM);
+
+  // const auto num_queries = gridDim.y;
+  const auto query_id          = blockIdx.y;
+  const auto num_cta_per_query = gridDim.x;
+  const auto cta_id            = blockIdx.x;  // local CTA ID
+
+#ifdef _CLK_BREAKDOWN
+  uint64_t clk_init                 = 0;
+  uint64_t clk_compute_1st_distance = 0;
+  uint64_t clk_topk                 = 0;
+  uint64_t clk_pickup_parents       = 0;
+  uint64_t clk_compute_distance     = 0;
+  uint64_t clk_start;
+#define _CLK_START() clk_start = clock64()
+#define _CLK_REC(V)  V += clock64() - clk_start;
+#else  // timing disabled: both macros expand to nothing (they are not #undef'd after the kernel)
+#define _CLK_START()
+#define _CLK_REC(V)
+#endif
+  _CLK_START();
+
+  extern __shared__ uint32_t smem[];
+
+  // Layout of result_buffer
+  // +----------------+------------------------------+---------+
+  // | internal_top_k | neighbors of parent nodes    | padding |
+  // | <itopk_size>   | <num_parents * graph_degree> | 0..31   |
+  // +----------------+------------------------------+---------+
+  // |<---          result_buffer_size           --->|
+  uint32_t result_buffer_size    = itopk_size + (num_parents * graph_degree);
+  uint32_t result_buffer_size_32 = result_buffer_size;
+  if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); }
+  assert(result_buffer_size_32 <= MAX_ELEMENTS);
+
+  // Carve the dynamic shared memory: query | result indices | result distances | parents | flag
+  auto query_buffer          = reinterpret_cast<float*>(smem);
+  auto result_indices_buffer = reinterpret_cast<INDEX_T*>(query_buffer + MAX_DATASET_DIM);
+  auto result_distances_buffer =
+    reinterpret_cast<DISTANCE_T*>(result_indices_buffer + result_buffer_size_32);
+  auto parent_indices_buffer =
+    reinterpret_cast<uint32_t*>(result_distances_buffer + result_buffer_size_32);
+  auto terminate_flag = reinterpret_cast<uint32_t*>(parent_indices_buffer + num_parents);
+
+#if 0
+    /* debug */
+    for (unsigned i = threadIdx.x; i < result_buffer_size_32; i += BLOCK_SIZE) {
+        result_indices_buffer[i] = utils::get_max_value<INDEX_T>();
+        result_distances_buffer[i] = utils::get_max_value<DISTANCE_T>();
+    }
+#endif
+
+  // Copy the query into shared memory as scaled floats; entries past dataset_dim are zeroed.
+  const DATA_T* const query_ptr = queries_ptr + (dataset_dim * query_id);
+  for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += BLOCK_SIZE) {
+    unsigned j = device::swizzling(i);
+    if (i < dataset_dim) {
+      query_buffer[j] = static_cast<float>(query_ptr[i]) * device::fragment_scale<DATA_T>();
+    } else {
+      query_buffer[j] = 0.0;
+    }
+  }
+  if (threadIdx.x == 0) { terminate_flag[0] = 0; }
+  uint32_t* local_visited_hashmap_ptr =
+    visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * query_id);  // depends on query_id only
+  __syncthreads();
+  _CLK_REC(clk_init);
+
+  // compute distances to the initial (seed or randomly selected) nodes
+  _CLK_START();
+  const INDEX_T* const local_seed_ptr = seed_ptr ? seed_ptr + (num_seeds * query_id) : nullptr;
+  device::compute_distance_to_random_nodes<TEAM_SIZE, MAX_DATASET_DIM, LOAD_T>(
+    result_indices_buffer,
+    result_distances_buffer,
+    query_buffer,
+    dataset_ptr,
+    dataset_dim,
+    dataset_size,
+    result_buffer_size,
+    num_distilation,
+    rand_xor_mask,
+    local_seed_ptr,
+    num_seeds,
+    local_visited_hashmap_ptr,
+    hash_bitlen,
+    cta_id,
+    num_cta_per_query);
+  __syncthreads();
+  _CLK_REC(clk_compute_1st_distance);
+
+  uint32_t iter = 0;
+  while (1) {
+    // topk with bitonic sort (performed by warp 0 only; see topk_by_bitonic_sort)
+    _CLK_START();
+    topk_by_bitonic_sort<MAX_ELEMENTS>(result_distances_buffer,
+                                       result_indices_buffer,
+                                       itopk_size + (num_parents * graph_degree),
+                                       itopk_size);
+    _CLK_REC(clk_topk);
+
+    if (iter + 1 == max_iteration) {  // iteration budget reached: stop after this sort
+      __syncthreads();
+      break;
+    }
+
+    // pick up next parents
+    _CLK_START();
+    pickup_next_parents<INDEX_T>(
+      parent_indices_buffer, num_parents, result_indices_buffer, itopk_size, terminate_flag);
+    _CLK_REC(clk_pickup_parents);
+
+    __syncthreads();  // publishes parent_indices_buffer and terminate_flag to the whole block
+    if (*terminate_flag && iter >= min_iteration) { break; }
+
+    // compute the distances between the child nodes of the parents and the query
+    _CLK_START();
+    // constexpr unsigned max_n_frags = 16;
+    constexpr unsigned max_n_frags = 0;
+    device::
+      compute_distance_to_child_nodes<TEAM_SIZE, BLOCK_SIZE, MAX_DATASET_DIM, max_n_frags, LOAD_T>(
+        result_indices_buffer + itopk_size,
+        result_distances_buffer + itopk_size,
+        query_buffer,
+        dataset_ptr,
+        dataset_dim,
+        knn_graph,
+        graph_degree,
+        local_visited_hashmap_ptr,
+        hash_bitlen,
+        parent_indices_buffer,
+        num_parents);
+    _CLK_REC(clk_compute_distance);
+    __syncthreads();
+
+    iter++;
+  }
+
+  // Write this CTA's itopk_size results to its slice [query_id, cta_id, :] of the outputs.
+  for (uint32_t i = threadIdx.x; i < itopk_size; i += BLOCK_SIZE) {
+    uint32_t j = i + (itopk_size * (cta_id + (num_cta_per_query * query_id)));
+    if (result_distances_ptr != nullptr) { result_distances_ptr[j] = result_distances_buffer[i]; }
+    result_indices_ptr[j] = result_indices_buffer[i] & ~0x80000000;  // clear most significant bit
+  }
+
+  if (threadIdx.x == 0 && cta_id == 0 && num_executed_iterations != nullptr) {
+    num_executed_iterations[query_id] = iter + 1;  // stats: written by CTA 0 of each query only
+  }
+
+#ifdef _CLK_BREAKDOWN
+  if ((threadIdx.x == 0 || threadIdx.x == BLOCK_SIZE - 1) && (blockIdx.x == 0) &&
+      ((query_id * 3) % gridDim.y < 3)) {
+    RAFT_LOG_DEBUG(
+      "query, %d, thread, %d"
+      ", init, %d"  // NOTE(review): clk_init is uint64_t but is printed with %d - confirm
+      ", 1st_distance, %lu"
+      ", topk, %lu"
+      ", pickup_parents, %lu"
+      ", distance, %lu"
+      "\n",
+      query_id,
+      threadIdx.x,
+      clk_init,
+      clk_compute_1st_distance,
+      clk_topk,
+      clk_pickup_parents,
+      clk_compute_distance);
+  }
+#endif
+}
+
+// SET_MC_KERNEL_3: bind `kernel` to one fully specified search_kernel instantiation.
+#define SET_MC_KERNEL_3(BLOCK_SIZE, BLOCK_COUNT, MAX_ELEMENTS, LOAD_T) \
+  kernel = search_kernel<TEAM_SIZE,                                    \
+                         BLOCK_SIZE,                                   \
+                         BLOCK_COUNT,                                  \
+                         MAX_ELEMENTS,                                 \
+                         MAX_DATASET_DIM,                              \
+                         DATA_T,                                       \
+                         DISTANCE_T,                                   \
+                         INDEX_T,                                      \
+                         LOAD_T>;
+// SET_MC_KERNEL_2: pick the load type from `load_bit_length`; only 128 and 64 are handled.
+#define SET_MC_KERNEL_2(BLOCK_SIZE, BLOCK_COUNT, MAX_ELEMENTS)                    \
+  if (load_bit_length == 128) {                                                   \
+    SET_MC_KERNEL_3(BLOCK_SIZE, BLOCK_COUNT, MAX_ELEMENTS, device::LOAD_128BIT_T) \
+  } else if (load_bit_length == 64) {                                             \
+    SET_MC_KERNEL_3(BLOCK_SIZE, BLOCK_COUNT, MAX_ELEMENTS, device::LOAD_64BIT_T)  \
+  }
+// SET_MC_KERNEL_1: pick BLOCK_SIZE / BLOCK_COUNT from `block_size`; any value other than
+// 64, 128, 256 or 512 selects the 1024-thread instantiation.
+#define SET_MC_KERNEL_1(MAX_ELEMENTS)         \
+  if (block_size == 64) {                     \
+    SET_MC_KERNEL_2(64, 16, MAX_ELEMENTS)     \
+  } else if (block_size == 128) {             \
+    SET_MC_KERNEL_2(128, 8, MAX_ELEMENTS)     \
+  } else if (block_size == 256) {             \
+    SET_MC_KERNEL_2(256, 4, MAX_ELEMENTS)     \
+  } else if (block_size == 512) {             \
+    SET_MC_KERNEL_2(512, 2, MAX_ELEMENTS)     \
+  } else {                                    \
+    SET_MC_KERNEL_2(1024, 1, MAX_ELEMENTS)    \
+  }
+// SET_MC_KERNEL: declare `kernel` and select MAX_ELEMENTS from `result_buffer_size` (<= 256).
+// `kernel` starts as nullptr, so an unmatched size or load width is never left indeterminate.
+#define SET_MC_KERNEL                                                       \
+  typedef void (*search_kernel_t)(INDEX_T* const result_indices_ptr,        \
+                                  DISTANCE_T* const result_distances_ptr,   \
+                                  const DATA_T* const dataset_ptr,          \
+                                  const size_t dataset_dim,                 \
+                                  const size_t dataset_size,                \
+                                  const DATA_T* const queries_ptr,          \
+                                  const INDEX_T* const knn_graph,           \
+                                  const uint32_t graph_degree,              \
+                                  const unsigned num_distilation,           \
+                                  const uint64_t rand_xor_mask,             \
+                                  const INDEX_T* seed_ptr,                  \
+                                  const uint32_t num_seeds,                 \
+                                  uint32_t* const visited_hashmap_ptr,      \
+                                  const uint32_t hash_bitlen,               \
+                                  const uint32_t itopk_size,                \
+                                  const uint32_t num_parents,               \
+                                  const uint32_t min_iteration,             \
+                                  const uint32_t max_iteration,             \
+                                  uint32_t* const num_executed_iterations); \
+  search_kernel_t kernel = nullptr;                                         \
+  if (result_buffer_size <= 64) {                                           \
+    SET_MC_KERNEL_1(64)                                                     \
+  } else if (result_buffer_size <= 128) {                                   \
+    SET_MC_KERNEL_1(128)                                                    \
+  } else if (result_buffer_size <= 256) {                                   \
+    SET_MC_KERNEL_1(256)                                                    \
+  }
+
+template <class T>  // writes `val` to dev_ptr[e + ld * b] for every e < count, b < batch_size
+__global__ void set_value_batch_kernel(T* const dev_ptr,
+                                       const std::size_t ld,
+                                       const T val,
+                                       const std::size_t count,
+                                       const std::size_t batch_size)
+{
+  // Widen before multiplying: the 32-bit product blockIdx.x * blockDim.x wraps at 2^32.
+  const std::size_t tid = threadIdx.x + static_cast<std::size_t>(blockIdx.x) * blockDim.x;
+  if (tid >= count * batch_size) { return; }
+  const std::size_t batch_id = tid / count;
+  dev_ptr[(tid % count) + ld * batch_id] = val;  // element (tid % count) of row batch_id
+}
+
+template <class T>  // host launcher: one thread per element, count * batch_size elements in total
+void set_value_batch(T* const dev_ptr,
+                     const std::size_t ld,
+                     const T val,
+                     const std::size_t count,
+                     const std::size_t batch_size,
+                     cudaStream_t cuda_stream)
+{
+  constexpr std::uint32_t threads_per_block = 256;
+  const std::size_t num_elements            = count * batch_size;
+  const auto num_blocks = (num_elements + threads_per_block - 1) / threads_per_block;  // ceil-div
+  set_value_batch_kernel<T><<<num_blocks, threads_per_block, 0, cuda_stream>>>(dev_ptr, ld, val, count, batch_size);
+}
+
+// Multi-CTA search plan: set_params() fixes the launch configuration and buffers, operator() runs it.
+template <unsigned TEAM_SIZE,
+          unsigned MAX_DATASET_DIM,
+          typename DATA_T,
+          typename INDEX_T,
+          typename DISTANCE_T>
+struct search : public search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_queries;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::itopk_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::algo;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::team_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_parents;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::min_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::load_bit_length;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::thread_block_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_mode;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_min_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_max_fill_rate;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_random_samplings;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::rand_xor_mask;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::graph_degree;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::topk;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hash_bitlen;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_reset_interval;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dataset_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::result_buffer_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::smem_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::load_bit_lenght;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_executed_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dev_seed;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_seeds;
+
+  uint32_t num_cta_per_query;                          // gridDim.x of the search kernel
+  rmm::device_uvector<uint32_t> intermediate_indices;  // [max_queries, num_cta_per_query, itopk_size]
+  rmm::device_uvector<float> intermediate_distances;   // [max_queries, num_cta_per_query, itopk_size]
+  size_t topk_workspace_size;
+  rmm::device_uvector<uint32_t> topk_workspace;
+
+  search(raft::device_resources const& res,
+         search_params params,
+         int64_t dim,
+         int64_t graph_degree,
+         uint32_t topk)
+    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>(res, params, dim, graph_degree, topk),
+      intermediate_indices(0, res.get_stream()),
+      intermediate_distances(0, res.get_stream()),
+      topk_workspace(0, res.get_stream())
+
+  {
+    set_params(res);
+  }
+
+  void set_params(raft::device_resources const& res)
+  {
+    this->itopk_size   = 32;  // fixed here regardless of the value set by the base class
+    num_parents        = 1;   // fixed here regardless of the value set by the base class
+    num_cta_per_query  = max(num_parents, itopk_size / 32);
+    result_buffer_size = itopk_size + num_parents * graph_degree;
+    typedef raft::Pow2<32> AlignBytes;
+    unsigned result_buffer_size_32 = AlignBytes::roundUp(result_buffer_size);
+    // 256 is the largest MAX_ELEMENTS instantiated by SET_MC_KERNEL.
+    RAFT_EXPECTS(result_buffer_size_32 <= 256, "Result buffer size cannot exceed 256");
+
+    smem_size = sizeof(float) * max_dim +
+                (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 +
+                sizeof(uint32_t) * num_parents + sizeof(uint32_t);
+    RAFT_LOG_DEBUG("# smem_size: %u", smem_size);
+
+    //
+    // Determine the thread block size
+    //
+    constexpr unsigned min_block_size = 64;
+    constexpr unsigned max_block_size = 1024;
+    uint32_t block_size               = thread_block_size;
+    if (block_size == 0) {
+      block_size = min_block_size;
+
+      // Increase block size according to shared memory requirements.
+      // If block size is 32, upper limit of shared memory size per
+      // thread block is set to 4096. This is GPU generation dependent.
+      constexpr unsigned ulimit_smem_size_cta32 = 4096;
+      while (smem_size > ulimit_smem_size_cta32 / 32 * block_size) {
+        block_size *= 2;
+      }
+
+      // Increase block size to improve GPU occupancy when total number of
+      // CTAs (= num_cta_per_query * max_queries) is small.
+      cudaDeviceProp deviceProp = res.get_device_properties();
+      RAFT_LOG_DEBUG("# multiProcessorCount: %d", deviceProp.multiProcessorCount);
+      while ((block_size < max_block_size) &&
+             (graph_degree * num_parents * team_size >= block_size * 2) &&
+             (num_cta_per_query * max_queries <=
+              (1024 / (block_size * 2)) * deviceProp.multiProcessorCount)) {
+        block_size *= 2;
+      }
+    }
+    RAFT_LOG_DEBUG("# thread_block_size: %u", block_size);
+    RAFT_EXPECTS(block_size >= min_block_size,
+                 "block_size cannot be smaller than min_block size, %u",
+                 min_block_size);
+    RAFT_EXPECTS(block_size <= max_block_size,
+                 "block_size cannot be larger than max_block size %u",
+                 max_block_size);
+    thread_block_size = block_size;
+
+    //
+    // Determine load bit length
+    //
+    const uint32_t total_bit_length = dim * sizeof(DATA_T) * 8;
+    if (load_bit_length == 0) {
+      load_bit_length = 128;
+      while (total_bit_length % load_bit_length) {
+        load_bit_length /= 2;
+      }
+    }
+    RAFT_LOG_DEBUG("# load_bit_length: %u  (%u loads per vector)",
+                   load_bit_length,
+                   total_bit_length / load_bit_length);
+    RAFT_EXPECTS(total_bit_length % load_bit_length == 0,
+                 "load_bit_length must be a divisor of dim*sizeof(data_t)*8=%u",
+                 total_bit_length);
+    RAFT_EXPECTS(load_bit_length >= 64, "load_bit_lenght cannot be less than 64");
+
+    // Allocate memory for intermediate buffer and workspace.
+    // The kernel writes [num_queries, num_cta_per_query, itopk_size] results, so the
+    // intermediate buffers need num_intermediate_results entries for each of max_queries rows.
+    uint32_t num_intermediate_results = num_cta_per_query * itopk_size;
+    const std::size_t buffer_len = static_cast<std::size_t>(num_intermediate_results) * max_queries;
+    intermediate_indices.resize(buffer_len, res.get_stream());
+    intermediate_distances.resize(buffer_len, res.get_stream());
+    hashmap.resize(hashmap_size, res.get_stream());
+
+    topk_workspace_size = _cuann_find_topk_bufferSize(
+      topk, max_queries, num_intermediate_results, utils::get_cuda_data_type<DATA_T>());
+    RAFT_LOG_DEBUG("# topk_workspace_size: %lu", topk_workspace_size);
+    topk_workspace.resize(topk_workspace_size, res.get_stream());
+  }
+
+  ~search() {}
+  // Resets the hash table, launches the search kernel, then reduces per-CTA results to top-k.
+  void operator()(raft::device_resources const& res,
+                  raft::device_matrix_view<const DATA_T, INDEX_T, row_major> dataset,
+                  raft::device_matrix_view<const INDEX_T, INDEX_T, row_major> graph,
+                  INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
+                  DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
+                  const DATA_T* const queries_ptr,       // [num_queries, dataset_dim]
+                  const uint32_t num_queries,
+                  const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
+                  uint32_t* const num_executed_iterations,  // [num_queries,]
+                  uint32_t topk)
+  {
+    cudaStream_t stream = res.get_stream();
+    uint32_t block_size = thread_block_size;
+
+    SET_MC_KERNEL;
+    RAFT_CUDA_TRY(
+      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    // Initialize hash table
+    const uint32_t hash_size = hashmap::get_size(hash_bitlen);
+    set_value_batch(
+      hashmap.data(), hash_size, utils::get_max_value<uint32_t>(), hash_size, num_queries, stream);
+
+    dim3 block_dims(block_size, 1, 1);
+    dim3 grid_dims(num_cta_per_query, num_queries, 1);
+    RAFT_LOG_DEBUG("Launching kernel with %u threads, (%u, %u) blocks %lu smem",
+                   block_size,
+                   num_cta_per_query,
+                   num_queries,
+                   smem_size);
+    kernel<<<grid_dims, block_dims, smem_size, stream>>>(intermediate_indices.data(),
+                                                         intermediate_distances.data(),
+                                                         dataset.data_handle(),
+                                                         dataset.extent(1),
+                                                         dataset.extent(0),
+                                                         queries_ptr,
+                                                         graph.data_handle(),
+                                                         graph.extent(1),
+                                                         num_random_samplings,
+                                                         rand_xor_mask,
+                                                         dev_seed_ptr,
+                                                         num_seeds,
+                                                         hashmap.data(),
+                                                         hash_bitlen,
+                                                         itopk_size,
+                                                         num_parents,
+                                                         min_iterations,
+                                                         max_iterations,
+                                                         num_executed_iterations);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+    // Select the top-k results from the intermediate results
+    const uint32_t num_intermediate_results = num_cta_per_query * itopk_size;
+    _cuann_find_topk(topk,
+                     num_queries,
+                     num_intermediate_results,
+                     intermediate_distances.data(),
+                     num_intermediate_results,
+                     intermediate_indices.data(),
+                     num_intermediate_results,
+                     topk_distances_ptr,
+                     topk,
+                     topk_indices_ptr,
+                     topk,
+                     topk_workspace.data(),
+                     true,
+                     NULL,
+                     stream);
+  }
+};
+
+}  // namespace multi_cta_search
+}  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh
new file mode 100644
index 0000000000..f688941239
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -0,0 +1,721 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <memory>
+#include <numeric>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+#include <vector>
+
+#include "compute_distance.hpp"
+#include "device_common.hpp"
+#include "fragment.hpp"
+#include "hashmap.hpp"
+#include "search_plan.cuh"
+#include "topk_for_cagra/topk_core.cuh"  //todo replace with raft kernel
+#include "utils.hpp"
+#include <raft/core/logger.hpp>
+#include <raft/util/cuda_rt_essentials.hpp>
+#include <raft/util/cudart_utils.hpp>  // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
+
+namespace raft::neighbors::experimental::cagra::detail {
+namespace multi_kernel_search {
+
+template <class T>  // Kernel: stores `val` into the single element addressed by `dev_ptr`.
+__global__ void set_value_kernel(T* const dev_ptr, const T val)
+{
+  dev_ptr[0] = val;
+}
+
+template <class T>  // Kernel: one thread per element, writes `val` to dev_ptr[0..count).
+__global__ void set_value_kernel(T* const dev_ptr, const T val, const std::size_t count)
+{
+  // Flat element index kept in std::size_t so that it cannot wrap around in 32-bit arithmetic.
+  const std::size_t tid = threadIdx.x + static_cast<std::size_t>(blockIdx.x) * blockDim.x;
+  if (tid < count) { dev_ptr[tid] = val; }
+}
+
+template <class T>  // Host helper: enqueues a store of `val` to `*dev_ptr` on `cuda_stream`.
+void set_value(T* const dev_ptr, const T val, cudaStream_t cuda_stream)
+{
+  set_value_kernel<T><<<dim3(1), dim3(1), 0, cuda_stream>>>(dev_ptr, val);  // 1 block x 1 thread
+}
+
+template <class T>  // Host helper: fills dev_ptr[0..count) with `val` on `cuda_stream`.
+void set_value(T* const dev_ptr, const T val, const std::size_t count, cudaStream_t cuda_stream)
+{
+  if (count == 0) { return; }  // a grid with zero blocks is an invalid launch configuration
+  constexpr std::uint32_t tpb = 256;  // threads per block; the grid is ceil(count / tpb)
+  set_value_kernel<T><<<(count + tpb - 1) / tpb, tpb, 0, cuda_stream>>>(dev_ptr, val, count);
+}
+
+template <class T>  // Kernel: copies the single element at `dev_ptr` to `host_ptr`.
+__global__ void get_value_kernel(T* const host_ptr, const T* const dev_ptr)
+{
+  host_ptr[0] = dev_ptr[0];  // NOTE(review): host_ptr must be writable from the device - confirm
+}
+
+template <class T>  // Host helper: enqueues the single-element copy kernel on `cuda_stream`.
+void get_value(T* const host_ptr, const T* const dev_ptr, cudaStream_t cuda_stream)
+{
+  get_value_kernel<T><<<dim3(1), dim3(1), 0, cuda_stream>>>(host_ptr, dev_ptr);  // 1 thread
+}
+
+// Picks one start candidate per team: the closest of `num_distilation` seeded or random nodes.
+// MAX_DATASET_DIM : must be equal to or greater than dataset_dim
+template <unsigned TEAM_SIZE,
+          unsigned MAX_DATASET_DIM,
+          class DATA_T,
+          class DISTANCE_T,
+          class INDEX_T>
+__global__ void random_pickup_kernel(
+  const DATA_T* const dataset_ptr,  // [dataset_size, dataset_dim]
+  const std::size_t dataset_dim,
+  const std::size_t dataset_size,
+  const DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
+  const std::size_t num_pickup,
+  const unsigned num_distilation,
+  const uint64_t rand_xor_mask,
+  const INDEX_T* seed_ptr,  // [num_queries, num_seeds]
+  const uint32_t num_seeds,
+  INDEX_T* const result_indices_ptr,         // [num_queries, ldr]
+  DISTANCE_T* const result_distances_ptr,    // [num_queries, ldr]
+  const std::uint32_t ldr,                   // (*) ldr >= num_pickup
+  std::uint32_t* const visited_hashmap_ptr,  // [num_queries, 1 << bitlen]
+  const std::uint32_t hash_bitlen)
+{
+  const auto ldb               = hashmap::get_size(hash_bitlen);
+  const auto global_team_index = (blockIdx.x * blockDim.x + threadIdx.x) / TEAM_SIZE;
+  const uint32_t query_id      = blockIdx.y;
+  if (global_team_index >= num_pickup) { return; }
+  // Load a query
+  device::fragment<MAX_DATASET_DIM, DATA_T, TEAM_SIZE> query_frag;
+  device::load_vector_sync(query_frag, queries_ptr + query_id * dataset_dim, dataset_dim);
+
+  INDEX_T best_index_team_local    = utils::get_max_value<INDEX_T>();  // "no candidate" so far
+  DISTANCE_T best_norm2_team_local = utils::get_max_value<DISTANCE_T>();
+  for (unsigned i = 0; i < num_distilation; i++) {
+    INDEX_T seed_index;
+    if (seed_ptr && (global_team_index < num_seeds)) {
+      seed_index = seed_ptr[global_team_index + (num_seeds * query_id)];
+    } else {
+      // Choose a seed node randomly
+      seed_index = device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_size;
+    }
+    device::fragment<MAX_DATASET_DIM, DATA_T, TEAM_SIZE> random_data_frag;
+    device::load_vector_sync(
+      random_data_frag, dataset_ptr + (dataset_dim * seed_index), dataset_dim);
+
+    // Compute the norm of two data
+    const auto norm2 =
+      device::norm2<DISTANCE_T>(query_frag, random_data_frag, device::fragment_scale<DATA_T>());
+    if (norm2 < best_norm2_team_local) {
+      best_norm2_team_local = norm2;
+      best_index_team_local = seed_index;
+    }
+  }
+
+  const auto store_gmem_index = global_team_index + (ldr * query_id);
+  if (threadIdx.x % TEAM_SIZE == 0) {
+    // Only a real candidate may enter the hashmap; the sentinel is reported as "no result".
+    const bool found = (best_index_team_local != utils::get_max_value<INDEX_T>());
+    if (found && hashmap::insert(
+                   visited_hashmap_ptr + (ldb * query_id), hash_bitlen, best_index_team_local)) {
+      result_distances_ptr[store_gmem_index] = best_norm2_team_local;
+      result_indices_ptr[store_gmem_index]   = best_index_team_local;
+    } else {
+      result_distances_ptr[store_gmem_index] = utils::get_max_value<DISTANCE_T>();
+      result_indices_ptr[store_gmem_index]   = utils::get_max_value<INDEX_T>();
+    }
+  }
+}
+
+// Host launcher for random_pickup_kernel. MAX_DATASET_DIM must be >= dataset_dim.
+template <unsigned TEAM_SIZE,
+          unsigned MAX_DATASET_DIM,
+          class DATA_T,
+          class DISTANCE_T,
+          class INDEX_T>
+void random_pickup(const DATA_T* const dataset_ptr,  // [dataset_size, dataset_dim]
+                   const std::size_t dataset_dim,
+                   const std::size_t dataset_size,
+                   const DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
+                   const std::size_t num_queries,
+                   const std::size_t num_pickup,
+                   const unsigned num_distilation,
+                   const uint64_t rand_xor_mask,
+                   const INDEX_T* seed_ptr,  // [num_queries, num_seeds]
+                   const uint32_t num_seeds,
+                   INDEX_T* const result_indices_ptr,         // [num_queries, ldr]
+                   DISTANCE_T* const result_distances_ptr,    // [num_queries, ldr]
+                   const std::size_t ldr,                     // (*) ldr >= num_pickup
+                   std::uint32_t* const visited_hashmap_ptr,  // [num_queries, 1 << bitlen]
+                   const std::uint32_t hash_bitlen,
+                   cudaStream_t const cuda_stream = 0)
+{
+  // Grid: x covers the teams needed for `num_pickup` candidates, y enumerates the queries.
+  constexpr unsigned block_dims      = 256u;
+  constexpr unsigned teams_per_block = block_dims / TEAM_SIZE;
+  const dim3 grid_dims((num_pickup + teams_per_block - 1) / teams_per_block, num_queries);
+
+  random_pickup_kernel<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, DISTANCE_T, INDEX_T>
+    <<<grid_dims, block_dims, 0, cuda_stream>>>(dataset_ptr,
+                                                dataset_dim,
+                                                dataset_size,
+                                                queries_ptr,
+                                                num_pickup,
+                                                num_distilation,
+                                                rand_xor_mask,
+                                                seed_ptr,
+                                                num_seeds,
+                                                result_indices_ptr,
+                                                result_distances_ptr,
+                                                ldr,
+                                                visited_hashmap_ptr,
+                                                hash_bitlen);
+}
+
+// One block per query. The first warp moves up to `parent_list_size` unused candidates into
+// the parent list and flags them via the MSB; with a small hash, the other threads reset it.
+template <class INDEX_T>
+__global__ void pickup_next_parents_kernel(
+  INDEX_T* const parent_candidates_ptr,        // [num_queries, lds]
+  const std::size_t lds,                       // (*) lds >= parent_candidates_size
+  const std::uint32_t parent_candidates_size,  //
+  std::uint32_t* const visited_hashmap_ptr,    // [num_queries, 1 << hash_bitlen]
+  const std::size_t hash_bitlen,
+  const std::uint32_t small_hash_bitlen,
+  INDEX_T* const parent_list_ptr,      // [num_queries, ldd]
+  const std::size_t ldd,               // (*) ldd >= parent_list_size
+  const std::size_t parent_list_size,  //
+  std::uint32_t* const terminate_flag)
+{
+  const std::size_t ldb   = hashmap::get_size(hash_bitlen);
+  const uint32_t query_id = blockIdx.x;
+  if (threadIdx.x < 32) {
+    // pickup next parents with single warp
+    for (std::uint32_t i = threadIdx.x; i < parent_list_size; i += 32) {
+      parent_list_ptr[i + (ldd * query_id)] = utils::get_max_value<INDEX_T>();
+    }
+    // Round the trip count up to a multiple of 32 so that every lane reaches __ballot_sync.
+    const std::uint32_t parent_candidates_size_max = (parent_candidates_size + 31) / 32 * 32;
+    std::uint32_t num_new_parents = 0;
+    for (std::uint32_t j = threadIdx.x; j < parent_candidates_size_max; j += 32) {
+      INDEX_T index;
+      int new_parent = 0;
+      if (j < parent_candidates_size) {
+        index = parent_candidates_ptr[j + (lds * query_id)];
+        if ((index & 0x80000000) == 0) {  // check most significant bit
+          new_parent = 1;
+        }
+      }
+      const std::uint32_t ballot_mask = __ballot_sync(0xffffffff, new_parent);
+      if (new_parent) {
+        // Slot = earlier rounds + lower lanes with a hit; `1u` avoids signed overflow at lane 31.
+        const auto i = __popc(ballot_mask & ((1u << threadIdx.x) - 1u)) + num_new_parents;
+        if (i < parent_list_size) {
+          parent_list_ptr[i + (ldd * query_id)] = index;
+          parent_candidates_ptr[j + (lds * query_id)] |= 0x80000000;  // set MSB: used node
+        }
+      }
+      num_new_parents += __popc(ballot_mask);
+      if (num_new_parents >= parent_list_size) { break; }
+    }
+    if ((num_new_parents > 0) && (threadIdx.x == 0)) { *terminate_flag = 0; }
+  } else if (small_hash_bitlen) {
+    // reset small-hash
+    hashmap::init<32>(visited_hashmap_ptr + (ldb * query_id), hash_bitlen);
+  }
+
+  if (small_hash_bitlen) {
+    __syncthreads();
+    // insert internal-topk indices into small-hash
+    for (unsigned i = threadIdx.x; i < parent_candidates_size; i += blockDim.x) {
+      auto key =
+        parent_candidates_ptr[i + (lds * query_id)] & ~0x80000000;  // clear most significant bit
+      hashmap::insert(visited_hashmap_ptr + (ldb * query_id), hash_bitlen, key);
+    }
+  }
+}
+
+template <class INDEX_T>  // Host launcher for pickup_next_parents_kernel (one block per query).
+void pickup_next_parents(
+  INDEX_T* const parent_candidates_ptr,      // [num_queries, lds]
+  const std::size_t lds,                     // (*) lds >= parent_candidates_size
+  const std::size_t parent_candidates_size,  //
+  const std::size_t num_queries,
+  std::uint32_t* const visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
+  const std::size_t hash_bitlen,
+  const std::size_t small_hash_bitlen,
+  INDEX_T* const parent_list_ptr,      // [num_queries, ldd]
+  const std::size_t ldd,               // (*) ldd >= parent_list_size
+  const std::size_t parent_list_size,  //
+  std::uint32_t* const terminate_flag,
+  cudaStream_t cuda_stream = 0)
+{
+  std::uint32_t block_size = 32;  // the kernel picks the parents with a single warp
+  if (small_hash_bitlen) {
+    // Power of two in [128, 512]; bounding the loop itself keeps block_size from wrapping to 0.
+    block_size = 128;
+    while ((block_size < 512) && (parent_candidates_size > block_size)) {
+      block_size *= 2;
+    }
+  }
+  pickup_next_parents_kernel<INDEX_T>
+    <<<num_queries, block_size, 0, cuda_stream>>>(parent_candidates_ptr,
+                                                  lds,
+                                                  parent_candidates_size,
+                                                  visited_hashmap_ptr,
+                                                  hash_bitlen,
+                                                  small_hash_bitlen,
+                                                  parent_list_ptr,
+                                                  ldd,
+                                                  parent_list_size,
+                                                  terminate_flag);
+}
+
+// One team of TEAM_SIZE threads per (parent, neighbor) pair; blockIdx.y selects the query.
+template <unsigned TEAM_SIZE,
+          unsigned MAX_DATASET_DIM,
+          class DATA_T,
+          class INDEX_T,
+          class DISTANCE_T>
+__global__ void compute_distance_to_child_nodes_kernel(
+  const INDEX_T* const parent_node_list,  // [num_queries, num_parents]
+  const std::uint32_t num_parents,
+  const DATA_T* const dataset_ptr,  // [dataset_size, data_dim]
+  const std::uint32_t data_dim,
+  const std::uint32_t dataset_size,
+  const INDEX_T* const neighbor_graph_ptr,  // [dataset_size, graph_degree]
+  const std::uint32_t graph_degree,
+  const DATA_T* query_ptr,                   // [num_queries, data_dim]
+  std::uint32_t* const visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
+  const std::uint32_t hash_bitlen,
+  INDEX_T* const result_indices_ptr,       // [num_queries, ldd]
+  DISTANCE_T* const result_distances_ptr,  // [num_queries, ldd]
+  const std::uint32_t ldd                  // (*) ldd >= num_parents * graph_degree
+)
+{
+  const std::size_t ldb      = hashmap::get_size(hash_bitlen);
+  const std::size_t query_id = blockIdx.y;  // std::size_t: the row offsets below cannot wrap
+  const auto tid             = threadIdx.x + blockDim.x * blockIdx.x;
+  const auto global_team_id  = tid / TEAM_SIZE;
+  if (global_team_id >= num_parents * graph_degree) { return; }
+
+  const std::size_t parent_index =
+    parent_node_list[global_team_id / graph_degree + (num_parents * query_id)];
+  if (parent_index == utils::get_max_value<INDEX_T>()) {
+    result_distances_ptr[ldd * query_id + global_team_id] = utils::get_max_value<DISTANCE_T>();
+    return;
+  }
+  const auto neighbor_list_head_ptr = neighbor_graph_ptr + (graph_degree * parent_index);
+  const std::size_t child_id        = neighbor_list_head_ptr[global_team_id % graph_degree];
+
+  if (hashmap::insert<TEAM_SIZE>(visited_hashmap_ptr + (ldb * query_id), hash_bitlen, child_id)) {
+    device::fragment<MAX_DATASET_DIM, DATA_T, TEAM_SIZE> frag_target;
+    device::load_vector_sync(frag_target, dataset_ptr + (data_dim * child_id), data_dim);
+    device::fragment<MAX_DATASET_DIM, DATA_T, TEAM_SIZE> frag_query;
+    device::load_vector_sync(frag_query, query_ptr + query_id * data_dim, data_dim);
+
+    const auto norm2 =
+      device::norm2<DISTANCE_T>(frag_target, frag_query, device::fragment_scale<DATA_T>());
+
+    if (threadIdx.x % TEAM_SIZE == 0) {
+      result_indices_ptr[ldd * query_id + global_team_id]   = child_id;
+      result_distances_ptr[ldd * query_id + global_team_id] = norm2;
+    }
+  } else {
+    if (threadIdx.x % TEAM_SIZE == 0) {
+      result_distances_ptr[ldd * query_id + global_team_id] = utils::get_max_value<DISTANCE_T>();
+    }
+  }
+}
+
+template <unsigned TEAM_SIZE,
+          unsigned MAX_DATASET_DIM,
+          class DATA_T,
+          class INDEX_T,
+          class DISTANCE_T>
+void compute_distance_to_child_nodes(
+  const INDEX_T* const parent_node_list,  // [num_queries, num_parents]
+  const uint32_t num_parents,
+  const DATA_T* const dataset_ptr,  // [dataset_size, data_dim]
+  const std::uint32_t data_dim,
+  const std::uint32_t dataset_size,
+  const INDEX_T* const neighbor_graph_ptr,  // [dataset_size, graph_degree]
+  const std::uint32_t graph_degree,
+  const DATA_T* query_ptr,  // [num_queries, data_dim]
+  const std::uint32_t num_queries,
+  std::uint32_t* const visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
+  const std::uint32_t hash_bitlen,
+  INDEX_T* const result_indices_ptr,       // [num_queries, ldd]
+  DISTANCE_T* const result_distances_ptr,  // [num_queries, ldd]
+  const std::uint32_t ldd,                 // (*) ldd >= num_parents * graph_degree
+  cudaStream_t cuda_stream = 0)
+{
+  // Launches the kernel with one team per (parent, neighbor) pair in x and one query per y.
+  constexpr unsigned block_dims  = 128;
+  constexpr unsigned block_teams = block_dims / TEAM_SIZE;
+  const dim3 grid_dims((num_parents * graph_degree + block_teams - 1) / block_teams, num_queries);
+  compute_distance_to_child_nodes_kernel<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, INDEX_T, DISTANCE_T>
+    <<<grid_dims, block_dims, 0, cuda_stream>>>(parent_node_list,
+                                                num_parents,
+                                                dataset_ptr,
+                                                data_dim,
+                                                dataset_size,
+                                                neighbor_graph_ptr,
+                                                graph_degree,
+                                                query_ptr,
+                                                visited_hashmap_ptr,
+                                                hash_bitlen,
+                                                result_indices_ptr,
+                                                result_distances_ptr,
+                                                ldd);
+}
+
+template <class INDEX_T>  // Kernel: one block per query, masks off the flag bit of each index.
+__global__ void remove_parent_bit_kernel(const std::uint32_t num_queries,
+                                         const std::uint32_t num_topk,
+                                         INDEX_T* const topk_indices_ptr,  // [num_queries, ld]
+                                         const std::uint32_t ld)
+{
+  const uint32_t row = blockIdx.x;  // query handled by this block
+  if (row < num_queries) {
+    for (unsigned col = threadIdx.x; col < num_topk; col += blockDim.x) {
+      topk_indices_ptr[col + (ld * row)] &= ~0x80000000;  // clear most significant bit
+    }
+  }
+}
+
+template <class INDEX_T>  // Host launcher for remove_parent_bit_kernel on `cuda_stream`.
+void remove_parent_bit(const std::uint32_t num_queries,
+                       const std::uint32_t num_topk,
+                       INDEX_T* const topk_indices_ptr,  // [num_queries, ld]
+                       const std::uint32_t ld,
+                       cudaStream_t cuda_stream = 0)
+{
+  constexpr std::size_t threads_per_block = 256;
+  const std::size_t num_blocks            = num_queries;  // the kernel maps one block to a query
+  remove_parent_bit_kernel<INDEX_T><<<num_blocks, threads_per_block, 0, cuda_stream>>>(
+    num_queries, num_topk, topk_indices_ptr, ld);
+}
+
+template <class T>  // Kernel: copies the first `count` elements of each of `batch_size` rows.
+__global__ void batched_memcpy_kernel(T* const dst,  // [batch_size, ld_dst]
+                                      const uint64_t ld_dst,
+                                      const T* const src,  // [batch_size, ld_src]
+                                      const uint64_t ld_src,
+                                      const uint64_t count,
+                                      const uint64_t batch_size)
+{
+  // Flat element index, widened to 64 bits so that it cannot wrap for very large copies.
+  const uint64_t tid = threadIdx.x + static_cast<uint64_t>(blockIdx.x) * blockDim.x;
+  if (tid >= count * batch_size) { return; }
+  const uint64_t i = tid % count, j = tid / count;  // column within the row, row (batch) index
+  dst[i + (ld_dst * j)] = src[i + (ld_src * j)];
+}
+
+template <class T>  // Host launcher: copies `count` elements from each of `batch_size` rows.
+void batched_memcpy(T* const dst,  // [batch_size, ld_dst]
+                    const uint64_t ld_dst,
+                    const T* const src,  // [batch_size, ld_src]
+                    const uint64_t ld_src,
+                    const uint64_t count,
+                    const uint64_t batch_size,
+                    cudaStream_t cuda_stream)
+{
+  assert(ld_dst >= count);
+  assert(ld_src >= count);
+  if (count == 0 || batch_size == 0) { return; }  // nothing to copy; an empty grid cannot launch
+  constexpr uint32_t tpb = 256;  // threads per block, one thread per copied element
+  batched_memcpy_kernel<T><<<(batch_size * count + tpb - 1) / tpb, tpb, 0, cuda_stream>>>(
+    dst, ld_dst, src, ld_src, count, batch_size);
+}
+
+template <class T>  // Kernel: writes `val` to the first `count` elements of `batch_size` rows.
+__global__ void set_value_batch_kernel(T* const dev_ptr,
+                                       const std::size_t ld,
+                                       const T val,
+                                       const std::size_t count,
+                                       const std::size_t batch_size)
+{
+  // Flat element index kept in std::size_t so that it cannot wrap around in 32-bit arithmetic.
+  const std::size_t tid = threadIdx.x + static_cast<std::size_t>(blockIdx.x) * blockDim.x;
+  if (tid >= count * batch_size) { return; }
+  const std::size_t batch_id = tid / count, elem_id = tid % count;  // row, column in the row
+  dev_ptr[elem_id + ld * batch_id] = val;
+}
+
+template <class T>  // Host launcher: fills `count` elements in each of `batch_size` rows.
+void set_value_batch(T* const dev_ptr,
+                     const std::size_t ld,
+                     const T val,
+                     const std::size_t count,
+                     const std::size_t batch_size,
+                     cudaStream_t cuda_stream)
+{
+  if (count == 0 || batch_size == 0) { return; }  // nothing to fill; an empty grid cannot launch
+  constexpr std::uint32_t tpb = 256;  // threads per block, one thread per written element
+  set_value_batch_kernel<T><<<(count * batch_size + tpb - 1) / tpb, tpb, 0, cuda_stream>>>(
+    dev_ptr, ld, val, count, batch_size);
+}
+
+// Multi-kernel CAGRA search plan. Layout of the per-query result_buffer (work buffer):
+// +--------------------+------------------------------+-------------------+
+// | internal_top_k (A) | neighbors of internal_top_k  | internal_topk (B) |
+// | <itopk_size>       | <num_parents * graph_degree> | <itopk_size>      |
+// +--------------------+------------------------------+-------------------+
+// |<---                 result_buffer_allocation_size                 --->|
+// |<---                       result_buffer_size  --->|                     // Double buffer (A)
+//                      |<---  result_buffer_size                      --->| // Double buffer (B)
+template <unsigned TEAM_SIZE,
+          unsigned MAX_DATASET_DIM,
+          typename DATA_T,
+          typename INDEX_T,
+          typename DISTANCE_T>
+struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_queries;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::itopk_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::algo;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::team_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_parents;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::min_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::load_bit_length;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::thread_block_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_mode;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_min_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_max_fill_rate;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_random_samplings;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::rand_xor_mask;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::graph_degree;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::topk;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hash_bitlen;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_reset_interval;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dataset_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::result_buffer_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::smem_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::load_bit_lenght;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_executed_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dev_seed;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_seeds;
+
+  size_t result_buffer_allocation_size;          // result_buffer_size + itopk_size (per query)
+  rmm::device_uvector<uint32_t> result_indices;  // results_indices_buffer
+  rmm::device_uvector<float> result_distances;   // result_distances_buffer
+  rmm::device_uvector<uint32_t> parent_node_list;  // [max_queries, num_parents]
+  rmm::device_uvector<uint32_t> topk_hint;         // [max_queries], handed to _cuann_find_topk
+  rmm::device_scalar<uint32_t> terminate_flag;     // device flag, read back through value()
+  rmm::device_uvector<uint32_t> topk_workspace;    // scratch space for _cuann_find_topk
+
+  search(raft::device_resources const& res,
+         search_params params,
+         int64_t dim,
+         int64_t graph_degree,
+         uint32_t topk)
+    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>(res, params, dim, graph_degree, topk),
+      result_indices(0, res.get_stream()),
+      result_distances(0, res.get_stream()),
+      parent_node_list(0, res.get_stream()),
+      topk_hint(0, res.get_stream()),
+      terminate_flag(res.get_stream()),  // initializers are listed in member declaration order
+      topk_workspace(0, res.get_stream())
+  {
+    set_params(res);
+  }
+
+  void set_params(raft::device_resources const& res)
+  {
+    //
+    // Allocate memory for intermediate buffer and workspace (all sized for max_queries queries).
+    //
+    result_buffer_size            = itopk_size + (num_parents * graph_degree);
+    result_buffer_allocation_size = result_buffer_size + itopk_size;
+    result_indices.resize(result_buffer_allocation_size * max_queries, res.get_stream());
+    result_distances.resize(result_buffer_allocation_size * max_queries, res.get_stream());
+
+    parent_node_list.resize(max_queries * num_parents, res.get_stream());
+    topk_hint.resize(max_queries, res.get_stream());
+
+    size_t topk_workspace_size = _cuann_find_topk_bufferSize(
+      itopk_size, max_queries, result_buffer_size, utils::get_cuda_data_type<DATA_T>());
+    RAFT_LOG_DEBUG("# topk_workspace_size: %lu", topk_workspace_size);
+    topk_workspace.resize(topk_workspace_size, res.get_stream());
+
+    hashmap.resize(hashmap_size, res.get_stream());
+  }
+
+  ~search() {}
+
+  void operator()(raft::device_resources const& res,
+                  raft::device_matrix_view<const DATA_T, INDEX_T, row_major> dataset,
+                  raft::device_matrix_view<const INDEX_T, INDEX_T, row_major> graph,
+                  INDEX_T* const topk_indices_ptr,       // [num_queries, topk]
+                  DISTANCE_T* const topk_distances_ptr,  // [num_queries, topk]
+                  const DATA_T* const queries_ptr,       // [num_queries, dataset_dim]
+                  const uint32_t num_queries,
+                  const INDEX_T* dev_seed_ptr,              // [num_queries, num_seeds]
+                  uint32_t* const num_executed_iterations,  // [num_queries,]
+                  uint32_t topk)
+  {
+    // Init hashmap: every slot of every query's table is set to the max uint32 value
+    cudaStream_t stream      = res.get_stream();
+    const uint32_t hash_size = hashmap::get_size(hash_bitlen);
+    set_value_batch(
+      hashmap.data(), hash_size, utils::get_max_value<uint32_t>(), hash_size, num_queries, stream);
+    // Init topk_hint
+    if (topk_hint.size() > 0) { set_value(topk_hint.data(), 0xffffffffu, num_queries, stream); }
+
+    // Choose initial entry point candidates at random (fills the first result_buffer_size slots)
+    random_pickup<TEAM_SIZE, MAX_DATASET_DIM, DATA_T, DISTANCE_T, INDEX_T>(
+      dataset.data_handle(),
+      dataset.extent(1),
+      dataset.extent(0),
+      queries_ptr,
+      num_queries,
+      result_buffer_size,
+      num_random_samplings,
+      rand_xor_mask,
+      dev_seed_ptr,
+      num_seeds,
+      result_indices.data(),
+      result_distances.data(),
+      result_buffer_allocation_size,
+      hashmap.data(),
+      hash_bitlen,
+      stream);
+
+    unsigned iter = 0;
+    while (1) {
+      // Make an index list of internal top-k nodes (source/destination swap with iter parity)
+      _cuann_find_topk(itopk_size,
+                       num_queries,
+                       result_buffer_size,
+                       result_distances.data() + (iter & 0x1) * itopk_size,
+                       result_buffer_allocation_size,
+                       result_indices.data() + (iter & 0x1) * itopk_size,
+                       result_buffer_allocation_size,
+                       result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size,
+                       result_buffer_allocation_size,
+                       result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size,
+                       result_buffer_allocation_size,
+                       topk_workspace.data(),
+                       true,
+                       topk_hint.data(),
+                       stream);
+
+      // termination (1): the iteration limit has been reached
+      if ((iter + 1 == max_iterations)) {
+        iter++;
+        break;
+      }
+
+      if (iter + 1 >= min_iterations) { set_value<uint32_t>(terminate_flag.data(), 1, stream); }
+
+      // pickup parent nodes (the kernel clears terminate_flag when a query gets a new parent)
+      uint32_t _small_hash_bitlen = 0;
+      if ((iter + 1) % small_hash_reset_interval == 0) { _small_hash_bitlen = small_hash_bitlen; }
+      pickup_next_parents(result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size,
+                          result_buffer_allocation_size,
+                          itopk_size,
+                          num_queries,
+                          hashmap.data(),
+                          hash_bitlen,
+                          _small_hash_bitlen,
+                          parent_node_list.data(),
+                          num_parents,
+                          num_parents,
+                          terminate_flag.data(),
+                          stream);
+
+      // termination (2): the flag set above survived, i.e. no query found a new parent
+      if (iter + 1 >= min_iterations && terminate_flag.value(stream)) {
+        iter++;
+        break;
+      }
+
+      // Compute distance to child nodes that are adjacent to the parent node
+      compute_distance_to_child_nodes<TEAM_SIZE, MAX_DATASET_DIM>(
+        parent_node_list.data(),
+        num_parents,
+        dataset.data_handle(),
+        dataset.extent(1),
+        dataset.extent(0),
+        graph.data_handle(),
+        graph.extent(1),
+        queries_ptr,
+        num_queries,
+        hashmap.data(),
+        hash_bitlen,
+        result_indices.data() + itopk_size,
+        result_distances.data() + itopk_size,
+        result_buffer_allocation_size,
+        stream);
+
+      iter++;
+    }  // while ( 1 )
+
+    // Remove parent bit in search results
+    remove_parent_bit(num_queries,
+                      itopk_size,
+                      result_indices.data() + (iter & 0x1) * result_buffer_size,
+                      result_buffer_allocation_size,
+                      stream);
+
+    // Copy results from working buffer to final buffer
+    batched_memcpy(topk_indices_ptr,
+                   topk,
+                   result_indices.data() + (iter & 0x1) * result_buffer_size,
+                   result_buffer_allocation_size,
+                   topk,
+                   num_queries,
+                   stream);
+    if (topk_distances_ptr) {
+      batched_memcpy(topk_distances_ptr,
+                     topk,
+                     result_distances.data() + (iter & 0x1) * result_buffer_size,
+                     result_buffer_allocation_size,
+                     topk,
+                     num_queries,
+                     stream);
+    }
+
+    if (num_executed_iterations) {  // NOTE(review): host-side writes - confirm a host pointer
+      for (std::uint32_t i = 0; i < num_queries; i++) {
+        num_executed_iterations[i] = iter;
+      }
+    }
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+};
+
+}  // namespace multi_kernel_search
+}  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
new file mode 100644
index 0000000000..d9613b345c
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "hashmap.hpp"
+// #include "search_single_cta.cuh"
+// #include "topk_for_cagra/topk_core.cuh"
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/neighbors/cagra_types.hpp>
+#include <raft/util/pow2_utils.cuh>
+
+namespace raft::neighbors::experimental::cagra::detail {
+
+// Search parameters extended with the dataset/graph geometry that every search plan needs.
+struct search_plan_impl_base : public search_params {
+  int64_t max_dim;  // power of two >= dim, starting at 128 (2048 when dim exceeds 1024)
+  int64_t dim;
+  int64_t graph_degree;
+  uint32_t topk;
+  search_plan_impl_base(search_params params, int64_t dim, int64_t graph_degree, uint32_t topk)
+    : search_params(params), dim(dim), graph_degree(graph_degree), topk(topk)
+  {
+    set_max_dim_team(dim);
+    if (algo == search_algo::AUTO) {
+      algo = (itopk_size <= 512) ? search_algo::SINGLE_CTA : search_algo::MULTI_KERNEL;
+      RAFT_LOG_DEBUG("Auto strategy: selecting %s",
+                     algo == search_algo::SINGLE_CTA ? "single-cta" : "multi-kernel");
+    }
+  }
+
+  // Derives max_dim/team_size from `dim`; warns only if a user-set team size really changes.
+  void set_max_dim_team(int64_t dim)
+  {
+    max_dim = 128;
+    while (max_dim < dim && max_dim <= 1024) { max_dim *= 2; }
+    const auto requested_team_size = team_size;
+    // To keep binary size in check we limit only one team size specialization for each max_dim.
+    // TODO(tfeher): revise this decision.
+    switch (max_dim) {
+      case 128: team_size = 8; break;
+      case 256: team_size = 16; break;
+      case 512: team_size = 32; break;
+      case 1024: team_size = 32; break;
+      default: RAFT_LOG_DEBUG("Dataset dimension is too large (%ld)\n", static_cast<long>(dim));
+    }
+    if (requested_team_size != 0 && requested_team_size != team_size) {
+      RAFT_LOG_WARN("Overriding team size parameter.");
+    }
+  }
+};
+
+// Common state of a CAGRA search plan: derives iteration limits and hash-table geometry.
+template <class DATA_T, class INDEX_T, class DISTANCE_T>
+struct search_plan_impl : public search_plan_impl_base {
+  int64_t hash_bitlen;               // bit length of the hash table size (e.g. 8 -> 256 slots)
+  size_t small_hash_bitlen;          // non-zero only when the small-hash scheme is selected
+  size_t small_hash_reset_interval;  // small-hash reset period, counted in iterations
+  size_t hashmap_size;               // sizeof(uint32_t) * max_queries * hash table size
+  uint32_t dataset_size;
+  uint32_t result_buffer_size;
+  uint32_t smem_size;
+  uint32_t load_bit_lenght;  // NOTE(review): not set here; check_params() reads `load_bit_length`
+  uint32_t topk;             // shadows search_plan_impl_base::topk; initialized by the ctor
+  uint32_t num_seeds;
+
+  rmm::device_uvector<uint32_t> hashmap;
+  rmm::device_uvector<uint32_t> num_executed_iterations;  // device or managed?
+  rmm::device_uvector<uint32_t> dev_seed;                 // IdxT
+
+  // Adjusts and validates the parameters and sizes the hash table; throws if they are invalid.
+  search_plan_impl(raft::device_resources const& res,
+                   search_params params,
+                   int64_t dim,
+                   int64_t graph_degree,
+                   uint32_t topk)
+    : search_plan_impl_base(params, dim, graph_degree, topk),
+      topk(topk),
+      num_seeds(0),
+      hashmap(0, res.get_stream()),
+      num_executed_iterations(0, res.get_stream()),
+      dev_seed(0, res.get_stream())
+  {
+    adjust_search_params();
+    check_params();
+    calc_hashmap_params(res);
+    num_executed_iterations.resize(max_queries, res.get_stream());
+    RAFT_LOG_DEBUG("# algo = %d", static_cast<int>(algo));
+  }
+
+  virtual ~search_plan_impl() {}
+  // Search entry point; this base version has an empty body and is meant to be overridden.
+  virtual void operator()(raft::device_resources const& res,
+                          raft::device_matrix_view<const DATA_T, INDEX_T, row_major> dataset,
+                          raft::device_matrix_view<const INDEX_T, INDEX_T, row_major> graph,
+                          INDEX_T* const result_indices_ptr,       // [num_queries, topk]
+                          DISTANCE_T* const result_distances_ptr,  // [num_queries, topk]
+                          const DATA_T* const queries_ptr,         // [num_queries, dataset_dim]
+                          const std::uint32_t num_queries,
+                          const INDEX_T* dev_seed_ptr,                   // [num_queries, num_seeds]
+                          std::uint32_t* const num_executed_iterations,  // [num_queries]
+                          uint32_t topk) {}
+  // Picks an automatic max_iterations when unset and rounds itopk_size up to a multiple of 32.
+  void adjust_search_params()
+  {
+    RAFT_EXPECTS(num_parents > 0, "`num_parents` must be greater than 0");
+    uint32_t _max_iterations = max_iterations;
+    if (max_iterations == 0) {
+      if (algo == search_algo::MULTI_CTA) {
+        _max_iterations = 1 + std::min(32 * 1.1, 32 + 10.0);  // TODO(anaruse)
+      } else {
+        _max_iterations =
+          1 + std::min((itopk_size / num_parents) * 1.1, (itopk_size / num_parents) + 10.0);
+      }
+    }
+    if (_max_iterations < min_iterations) { _max_iterations = min_iterations; }  // raise only
+    if (max_iterations < _max_iterations) {
+      RAFT_LOG_DEBUG(
+        "# max_iterations is increased from %u to %u.", max_iterations, _max_iterations);
+      max_iterations = _max_iterations;
+    }
+    if (itopk_size % 32) {
+      const uint32_t itopk32 = itopk_size + 32 - (itopk_size % 32);
+      RAFT_LOG_DEBUG("# internal_topk is increased from %u to %u, as it must be multiple of 32.",
+                     itopk_size,
+                     itopk32);
+      itopk_size = itopk32;
+    }
+  }
+  // Defines hash_bitlen, small_hash_bitlen, small_hash_reset_interval and hashmap_size.
+  // Throws when a forced small-hash does not fit or the regular table would exceed 2^20 slots.
+  inline void calc_hashmap_params(raft::device_resources const& res)
+  {
+    // for multiple CTA search
+    uint32_t mc_num_cta_per_query = 0;
+    uint32_t mc_num_parents       = 0;
+    uint32_t mc_itopk_size        = 0;
+    if (algo == search_algo::MULTI_CTA) {
+      mc_itopk_size        = 32;
+      mc_num_parents       = 1;
+      mc_num_cta_per_query = max(num_parents, itopk_size / 32);
+      RAFT_LOG_DEBUG("# mc_itopk_size: %u", mc_itopk_size);
+      RAFT_LOG_DEBUG("# mc_num_parents: %u", mc_num_parents);
+      RAFT_LOG_DEBUG("# mc_num_cta_per_query: %u", mc_num_cta_per_query);
+    }
+
+    // Determine hash size (bit length)
+    hashmap_size              = 0;
+    hash_bitlen               = 0;
+    small_hash_bitlen         = 0;
+    small_hash_reset_interval = 1024 * 1024;
+    float max_fill_rate       = hashmap_max_fill_rate;
+    while (hashmap_mode == hash_mode::AUTO || hashmap_mode == hash_mode::SMALL) {
+      //
+      // The small-hash reduces hash table size by initializing the hash table
+      // for each iteration and re-registering only the nodes that should not be
+      // re-visited in that iteration. Therefore, the size of small-hash should
+      // be determined based on the internal topk size and the number of nodes
+      // visited per iteration.
+      //
+      const auto max_visited_nodes = itopk_size + (num_parents * graph_degree * 1);
+      unsigned min_bitlen          = 8;   // 256
+      unsigned max_bitlen          = 13;  // 8K
+      if (min_bitlen < hashmap_min_bitlen) { min_bitlen = hashmap_min_bitlen; }
+      hash_bitlen = min_bitlen;
+      while (max_visited_nodes > hashmap::get_size(hash_bitlen) * max_fill_rate) {
+        hash_bitlen += 1;
+      }
+      if (hash_bitlen > max_bitlen) {
+        // Switch to normal hash if hashmap_mode is AUTO, otherwise fail.
+        if (hashmap_mode == hash_mode::AUTO) {
+          hash_bitlen = 0;
+          break;
+        } else {
+          // Throw instead of calling exit(): a library must not terminate the host process.
+          RAFT_FAIL(
+            "[CAGRA Error] small-hash cannot be used because the required hash size exceeds the "
+            "limit (%u)",
+            hashmap::get_size(max_bitlen));
+        }
+      }
+      small_hash_bitlen = hash_bitlen;
+      //
+      // Since the hash table size is limited to a power of 2, the requirement,
+      // the maximum fill rate, may be satisfied even if the frequency of hash
+      // table reset is reduced to once every 2 or more iterations without
+      // changing the hash table size. In that case, reduce the reset frequency.
+      //
+      small_hash_reset_interval = 1;
+      while (1) {
+        const auto interval_visited_nodes =
+          itopk_size + (num_parents * graph_degree * (small_hash_reset_interval + 1));
+        if (interval_visited_nodes > hashmap::get_size(hash_bitlen) * max_fill_rate) { break; }
+        small_hash_reset_interval += 1;
+      }
+      break;
+    }
+    if (hash_bitlen == 0) {
+      //
+      // The size of hash table is determined based on the maximum number of
+      // nodes that may be visited before the search is completed and the
+      // maximum fill rate of the hash table.
+      //
+      uint32_t max_visited_nodes = itopk_size + (num_parents * graph_degree * max_iterations);
+      if (algo == search_algo::MULTI_CTA) {
+        max_visited_nodes = mc_itopk_size + (mc_num_parents * graph_degree * max_iterations);
+        max_visited_nodes *= mc_num_cta_per_query;
+      }
+      unsigned min_bitlen = 11;  // 2K
+      if (min_bitlen < hashmap_min_bitlen) { min_bitlen = hashmap_min_bitlen; }
+      hash_bitlen = min_bitlen;
+      while (max_visited_nodes > hashmap::get_size(hash_bitlen) * max_fill_rate) {
+        hash_bitlen += 1;
+      }
+      RAFT_EXPECTS(hash_bitlen <= 20, "hash_bitlen cannot be largen than 20 (1M)");
+    }
+
+    RAFT_LOG_DEBUG("# internal topK = %lu", itopk_size);
+    RAFT_LOG_DEBUG("# parent size = %lu", num_parents);
+    RAFT_LOG_DEBUG("# min_iterations = %lu", min_iterations);
+    RAFT_LOG_DEBUG("# max_iterations = %lu", max_iterations);
+    RAFT_LOG_DEBUG("# max_queries = %lu", max_queries);
+    RAFT_LOG_DEBUG("# hashmap mode = %s%s-%u",
+                   (small_hash_bitlen > 0 ? "small-" : ""),
+                   "hash",
+                   hashmap::get_size(hash_bitlen));
+    if (small_hash_bitlen > 0) {
+      RAFT_LOG_DEBUG("# small_hash_reset_interval = %lu", small_hash_reset_interval);
+    }
+    hashmap_size = sizeof(std::uint32_t) * max_queries * hashmap::get_size(hash_bitlen);
+    RAFT_LOG_DEBUG("# hashmap size: %lu", hashmap_size);
+    if (hashmap_size >= 1024 * 1024 * 1024) {
+      RAFT_LOG_DEBUG(" (%.2f GiB)", (double)hashmap_size / (1024 * 1024 * 1024));
+    } else if (hashmap_size >= 1024 * 1024) {
+      RAFT_LOG_DEBUG(" (%.2f MiB)", (double)hashmap_size / (1024 * 1024));
+    } else if (hashmap_size >= 1024) {
+      RAFT_LOG_DEBUG(" (%.2f KiB)", (double)hashmap_size / (1024));
+    }
+  }
+  // Validates the per-call `topk` against the internal top-k configuration; throws on failure.
+  void check(uint32_t topk)
+  {
+    RAFT_EXPECTS(topk <= itopk_size, "topk must not exceed itopk_size = %lu", itopk_size);
+    if (algo == search_algo::MULTI_CTA) {
+      uint32_t mc_num_cta_per_query = max(num_parents, itopk_size / 32);
+      RAFT_EXPECTS(mc_num_cta_per_query * 32 >= topk,
+                   "`mc_num_cta_per_query` (%u) * 32 must be equal to or greater than "
+                   "`topk` (%u) when 'search_mode' is \"multi-cta\"",
+                   mc_num_cta_per_query,
+                   topk);
+    }
+  }
+  // Validates the search parameters and reports every violation in a single exception.
+  // Side effect: with MULTI_CTA (and a non-SMALL hash mode) hashmap_mode is forced to HASH.
+  inline void check_params()
+  {
+    std::string error_message = "";
+
+    if (itopk_size > 1024 && algo != search_algo::MULTI_CTA) {
+      error_message += std::string("- `internal_topk` (" + std::to_string(itopk_size) +
+                                   ") must be smaller or equal to 1024");
+    }
+    if (algo != search_algo::SINGLE_CTA && algo != search_algo::MULTI_CTA &&
+        algo != search_algo::MULTI_KERNEL) {
+      error_message += "An invalid kernel mode has been given: " + std::to_string((int)algo) + "";
+    }
+    if (team_size != 0 && team_size != 4 && team_size != 8 && team_size != 16 && team_size != 32) {
+      error_message +=
+        "`team_size` must be 0, 4, 8, 16 or 32. " + std::to_string(team_size) + " has been given.";
+    }
+    if (load_bit_length != 0 && load_bit_length != 64 && load_bit_length != 128) {
+      error_message += "`load_bit_length` must be 0, 64 or 128. " +
+                       std::to_string(load_bit_length) + " has been given.";
+    }
+    if (thread_block_size != 0 && thread_block_size != 64 && thread_block_size != 128 &&
+        thread_block_size != 256 && thread_block_size != 512 && thread_block_size != 1024) {
+      // Report the offending value itself; 1024 is an accepted size as well.
+      error_message += "`thread_block_size` must be 0, 64, 128, 256, 512 or 1024. " +
+                       std::to_string(thread_block_size) + " has been given.";
+    }
+    if (hashmap_min_bitlen > 20) {
+      error_message += "`hashmap_min_bitlen` must be equal to or smaller than 20. " +
+                       std::to_string(hashmap_min_bitlen) + " has been given.";
+    }
+    if (hashmap_max_fill_rate < 0.1 || hashmap_max_fill_rate >= 0.9) {
+      error_message +=
+        "`hashmap_max_fill_rate` must be equal to or greater than 0.1 and smaller than 0.9. " +
+        std::to_string(hashmap_max_fill_rate) + " has been given.";
+    }
+    if (algo == search_algo::MULTI_CTA) {
+      if (hashmap_mode == hash_mode::SMALL) {
+        error_message += "`small_hash` is not available when 'search_mode' is \"multi-cta\"";
+      } else {
+        // multi-cta is always run with the regular hash table
+        hashmap_mode = hash_mode::HASH;
+      }
+    }
+
+    if (error_message.length() != 0) { THROW("[CAGRA Error] %s", error_message.c_str()); }
+  }
+};
+
+// template <class DATA_T, class DISTANCE_T, class INDEX_T>
+// struct search_plan {
+//   search_plan(raft::device_resources const& res,
+//               search_params param,
+//               int64_t dim,
+//               int64_t graph_degree)
+//     : plan(res, param, dim, graph_degree)
+//   {
+//   }
+//   void check(uint32_t topk) { plan.check(topk); }
+
+//   // private:
+//   detail::search_plan_impl<DATA_T, DISTANCE_T, INDEX_T> plan;
+// };
+/** @} */  // end group cagra
+
+}  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh
new file mode 100644
index 0000000000..acd7ac321f
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh
@@ -0,0 +1,1157 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <memory>
+#include <numeric>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <rmm/device_uvector.hpp>
+#include <vector>
+
+#include "bitonic.hpp"
+#include "compute_distance.hpp"
+#include "device_common.hpp"
+#include "hashmap.hpp"
+#include "search_plan.cuh"
+#include "topk_for_cagra/topk_core.cuh"  // TODO replace with raft topk
+#include "utils.hpp"
+#include <raft/core/logger.hpp>
+#include <raft/util/cuda_rt_essentials.hpp>
+#include <raft/util/cudart_utils.hpp>  // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
+
+namespace raft::neighbors::experimental::cagra::detail {
+namespace single_cta_search {
+
+// #define _CLK_BREAKDOWN
+
+// Selects up to `num_parents` unused entries of the internal top-k list as the next parents.
+template <unsigned TOPK_BY_BITONIC_SORT, class INDEX_T>
+__device__ void pickup_next_parents(std::uint32_t* const terminate_flag,
+                                    INDEX_T* const next_parent_indices,
+                                    INDEX_T* const internal_topk_indices,
+                                    const std::size_t internal_topk_size,
+                                    const std::size_t dataset_size,
+                                    const std::uint32_t num_parents)
+{
+  // NOTE(review): written for one warp (stride 32, lane mask from threadIdx.x) - confirm callers.
+  for (std::uint32_t i = threadIdx.x; i < num_parents; i += 32) {
+    next_parent_indices[i] = utils::get_max_value<INDEX_T>();
+  }
+  std::uint32_t itopk_max = internal_topk_size;
+  if (itopk_max % 32) { itopk_max += 32 - (itopk_max % 32); }  // same trip count for all lanes
+  std::uint32_t num_new_parents = 0;
+  for (std::uint32_t j = threadIdx.x; j < itopk_max; j += 32) {
+    std::uint32_t jj = j;
+    if (TOPK_BY_BITONIC_SORT) { jj = device::swizzling(j); }
+    INDEX_T index;  // only read when new_parent != 0, i.e. after the load below
+    int new_parent = 0;
+    if (j < internal_topk_size) {
+      index = internal_topk_indices[jj];
+      if ((index & 0x80000000) == 0) {  // most significant bit clear: not used as a parent yet
+        new_parent = 1;
+      }
+    }
+    const std::uint32_t ballot_mask = __ballot_sync(0xffffffff, new_parent);
+    if (new_parent) {
+      // Output slot = earlier picks + votes of lower lanes; unsigned shift is defined for lane 31.
+      const auto i = __popc(ballot_mask & ((1u << threadIdx.x) - 1u)) + num_new_parents;
+      if (i < num_parents) {
+        next_parent_indices[i] = index;
+        internal_topk_indices[jj] |= 0x80000000;  // set most significant bit as used node
+      }
+    }
+    num_new_parents += __popc(ballot_mask);
+    if (num_new_parents >= num_parents) { break; }
+  }
+  if (threadIdx.x == 0 && (num_new_parents == 0)) { *terminate_flag = 1; }
+}
+
+template <unsigned MAX_INTERNAL_TOPK>
+struct topk_by_radix_sort_base {  // compile-time constants shared by all topk_by_radix_sort variants
+  static constexpr std::uint32_t smem_size        = MAX_INTERNAL_TOPK * 2 + 2048 + 8;  // NOTE(review): unit not visible here
+  static constexpr std::uint32_t state_bit_lenght = 0;  // (sic) forwarded to topk_cta_11_core
+  static constexpr std::uint32_t vecLen           = 2;  // TODO; forwarded to topk_cta_11_core
+};
+template <unsigned MAX_INTERNAL_TOPK, unsigned BLOCK_SIZE, class = void>
+struct topk_by_radix_sort : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {
+};  // primary template: constants only; operator() is provided by the enable_if specializations
+
+// Variant selected for MAX_INTERNAL_TOPK <= 64; `batch_size` is accepted but not used.
+template <unsigned MAX_INTERNAL_TOPK, unsigned BLOCK_SIZE>
+struct topk_by_radix_sort<MAX_INTERNAL_TOPK,
+                          BLOCK_SIZE,
+                          std::enable_if_t<(MAX_INTERNAL_TOPK <= 64)>>
+  : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {
+  __device__ void operator()(uint32_t topk,
+                             uint32_t batch_size,
+                             uint32_t len_x,
+                             const uint32_t* _x,
+                             const uint32_t* _in_vals,
+                             uint32_t* _y,
+                             uint32_t* _out_vals,
+                             uint32_t* work,
+                             uint32_t* _hints,
+                             bool sort,
+                             uint32_t* _smem)
+  {
+    using base_t = topk_by_radix_sort_base<MAX_INTERNAL_TOPK>;
+    // The work buffer is handed to the core routine as a byte pointer.
+    topk_cta_11_core<BLOCK_SIZE, base_t::state_bit_lenght, base_t::vecLen, 64, 32>(
+      topk, len_x, _x, _in_vals, _y, _out_vals,
+      reinterpret_cast<std::uint8_t*>(work), _hints, sort, _smem);
+  }
+};
+
+#define TOP_FUNC_PARTIAL_SPECIALIZATION(V)                                           \
+  template <unsigned MAX_INTERNAL_TOPK, unsigned BLOCK_SIZE>                         \
+  struct topk_by_radix_sort<                                                         \
+    MAX_INTERNAL_TOPK,                                                               \
+    BLOCK_SIZE,                                                                      \
+    std::enable_if_t<((MAX_INTERNAL_TOPK <= V) && (2 * MAX_INTERNAL_TOPK > V))>>     \
+    : topk_by_radix_sort_base<MAX_INTERNAL_TOPK> {                                   \
+    __device__ void operator()(uint32_t topk,                                        \
+                               uint32_t batch_size,                                  \
+                               uint32_t len_x,                                       \
+                               const uint32_t* _x,                                   \
+                               const uint32_t* _in_vals,                             \
+                               uint32_t* _y,                                         \
+                               uint32_t* _out_vals,                                  \
+                               uint32_t* work,                                       \
+                               uint32_t* _hints,                                     \
+                               bool sort,                                            \
+                               uint32_t* _smem)                                      \
+    {                                                                                \
+      assert(BLOCK_SIZE >= V / 4);                                                   \
+      std::uint8_t* state = (std::uint8_t*)work;                                     \
+      topk_cta_11_core<BLOCK_SIZE,                                                   \
+                       topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::state_bit_lenght, \
+                       topk_by_radix_sort_base<MAX_INTERNAL_TOPK>::vecLen,           \
+                       V,                                                            \
+                       V / 4>(                                                       \
+        topk, len_x, _x, _in_vals, _y, _out_vals, state, _hints, sort, _smem);       \
+    }                                                                                \
+  };
+TOP_FUNC_PARTIAL_SPECIALIZATION(128);   // selected when 64 < MAX_INTERNAL_TOPK <= 128
+TOP_FUNC_PARTIAL_SPECIALIZATION(256);   // selected when 128 < MAX_INTERNAL_TOPK <= 256
+TOP_FUNC_PARTIAL_SPECIALIZATION(512);   // selected when 256 < MAX_INTERNAL_TOPK <= 512
+TOP_FUNC_PARTIAL_SPECIALIZATION(1024);  // selected when 512 < MAX_INTERNAL_TOPK <= 1024
+
+template <unsigned MAX_CANDIDATES, unsigned MULTI_WARPS = 0>
+__device__ inline void topk_by_bitonic_sort_1st(
+  float* candidate_distances,        // [num_candidates]
+  std::uint32_t* candidate_indices,  // [num_candidates]
+  const std::uint32_t num_candidates,
+  const std::uint32_t num_itopk)
+{  // Sorts the candidates by distance in place; only sorted slots below num_itopk are stored.
+  const unsigned lane_id = threadIdx.x % 32;
+  const unsigned warp_id = threadIdx.x / 32;
+  if (MULTI_WARPS == 0) {  // single-warp path: warp 0 handles all candidates
+    if (warp_id > 0) { return; }
+    constexpr unsigned N = (MAX_CANDIDATES + 31) / 32;
+    float key[N];
+    std::uint32_t val[N];
+    /* Candidates -> Reg (entries beyond num_candidates are padded with max values) */
+    for (unsigned i = 0; i < N; i++) {
+      unsigned j = lane_id + (32 * i);
+      if (j < num_candidates) {
+        key[i] = candidate_distances[j];
+        val[i] = candidate_indices[j];
+      } else {
+        key[i] = utils::get_max_value<float>();
+        val[i] = utils::get_max_value<std::uint32_t>();
+      }
+    }
+    /* Sort */
+    bitonic::warp_sort<float, std::uint32_t, N>(key, val);
+    /* Reg -> Temp_itopk (slot j = N * lane_id + i, written through device::swizzling) */
+    for (unsigned i = 0; i < N; i++) {
+      unsigned j = (N * lane_id) + i;
+      if (j < num_candidates && j < num_itopk) {
+        candidate_distances[device::swizzling(j)] = key[i];
+        candidate_indices[device::swizzling(j)]   = val[i];
+      }
+    }
+  } else {
+    // Use two warps (64 threads): each sorts one half, then the halves are merged
+    constexpr unsigned max_candidates_per_warp = (MAX_CANDIDATES + 1) / 2;
+    constexpr unsigned N                       = (max_candidates_per_warp + 31) / 32;
+    float key[N];
+    std::uint32_t val[N];
+    if (warp_id < 2) {
+      /* Candidates -> Reg */
+      for (unsigned i = 0; i < N; i++) {
+        unsigned jl = lane_id + (32 * i);
+        unsigned j  = jl + (max_candidates_per_warp * warp_id);
+        if (j < num_candidates) {
+          key[i] = candidate_distances[j];
+          val[i] = candidate_indices[j];
+        } else {
+          key[i] = utils::get_max_value<float>();
+          val[i] = utils::get_max_value<std::uint32_t>();
+        }
+      }
+      /* Sort */
+      bitonic::warp_sort<float, std::uint32_t, N>(key, val);
+      /* Reg -> Temp_candidates */
+      for (unsigned i = 0; i < N; i++) {
+        unsigned jl = (N * lane_id) + i;
+        unsigned j  = jl + (max_candidates_per_warp * warp_id);
+        if (j < num_candidates && jl < num_itopk) {
+          candidate_distances[device::swizzling(j)] = key[i];
+          candidate_indices[device::swizzling(j)]   = val[i];
+        }
+      }
+    }
+    __syncthreads();  // both halves must be stored before they are cross-read below
+
+    unsigned num_warps_used = (num_itopk + max_candidates_per_warp - 1) / max_candidates_per_warp;
+    if (warp_id < num_warps_used) {
+      /* Compare with mirrored slot k: warp 0 keeps the smaller key, other warps the larger */
+      for (unsigned i = 0; i < N; i++) {
+        unsigned jl = (N * lane_id) + i;
+        unsigned kl = max_candidates_per_warp - 1 - jl;
+        unsigned j  = jl + (max_candidates_per_warp * warp_id);
+        unsigned k  = MAX_CANDIDATES - 1 - j;
+        if (j >= num_candidates || k >= num_candidates || kl >= num_itopk) continue;
+        float temp_key = candidate_distances[device::swizzling(k)];
+        if (key[i] == temp_key) continue;
+        if ((warp_id == 0) == (key[i] > temp_key)) {
+          key[i] = temp_key;
+          val[i] = candidate_indices[device::swizzling(k)];
+        }
+      }
+    }
+    if (num_warps_used > 1) { __syncthreads(); }
+    if (warp_id < num_warps_used) {
+      /* Merge */
+      bitonic::warp_merge<float, std::uint32_t, N>(key, val, 32);
+      /* Reg -> Temp_itopk */
+      for (unsigned i = 0; i < N; i++) {
+        unsigned jl = (N * lane_id) + i;
+        unsigned j  = jl + (max_candidates_per_warp * warp_id);
+        if (j < num_candidates && j < num_itopk) {
+          candidate_distances[device::swizzling(j)] = key[i];
+          candidate_indices[device::swizzling(j)]   = val[i];
+        }
+      }
+    }
+    if (num_warps_used > 1) { __syncthreads(); }
+  }
+}
+
+template <unsigned MAX_ITOPK, unsigned MULTI_WARPS = 0>
+__device__ inline void topk_by_bitonic_sort_2nd(
+  float* itopk_distances,        // [num_itopk]
+  std::uint32_t* itopk_indices,  // [num_itopk]
+  const std::uint32_t num_itopk,
+  float* candidate_distances,        // [num_candidates]
+  std::uint32_t* candidate_indices,  // [num_candidates]
+  const std::uint32_t num_candidates,
+  std::uint32_t* work_buf,
+  const bool first)
+{
+  const unsigned lane_id = threadIdx.x % 32;
+  const unsigned warp_id = threadIdx.x / 32;
+  if (MULTI_WARPS == 0) {
+    if (warp_id > 0) { return; }
+    constexpr unsigned N = (MAX_ITOPK + 31) / 32;
+    float key[N];
+    std::uint32_t val[N];
+    if (first) {
+      /* Load itopk results */
+      for (unsigned i = 0; i < N; i++) {
+        unsigned j = lane_id + (32 * i);
+        if (j < num_itopk) {
+          key[i] = itopk_distances[j];
+          val[i] = itopk_indices[j];
+        } else {
+          key[i] = utils::get_max_value<float>();
+          val[i] = utils::get_max_value<std::uint32_t>();
+        }
+      }
+      /* Warp Sort */
+      bitonic::warp_sort<float, std::uint32_t, N>(key, val);
+    } else {
+      /* Load itopk results */
+      for (unsigned i = 0; i < N; i++) {
+        unsigned j = (N * lane_id) + i;
+        if (j < num_itopk) {
+          key[i] = itopk_distances[device::swizzling(j)];
+          val[i] = itopk_indices[device::swizzling(j)];
+        } else {
+          key[i] = utils::get_max_value<float>();
+          val[i] = utils::get_max_value<std::uint32_t>();
+        }
+      }
+    }
+    /* Merge candidates */
+    for (unsigned i = 0; i < N; i++) {
+      unsigned j = (N * lane_id) + i;  // [0:MAX_ITOPK-1]
+      unsigned k = MAX_ITOPK - 1 - j;
+      if (k >= num_itopk || k >= num_candidates) continue;
+      float candidate_key = candidate_distances[device::swizzling(k)];
+      if (key[i] > candidate_key) {
+        key[i] = candidate_key;
+        val[i] = candidate_indices[device::swizzling(k)];
+      }
+    }
+    /* Warp Merge */
+    bitonic::warp_merge<float, std::uint32_t, N>(key, val, 32);
+    /* Store new itopk results */
+    for (unsigned i = 0; i < N; i++) {
+      unsigned j = (N * lane_id) + i;
+      if (j < num_itopk) {
+        itopk_distances[device::swizzling(j)] = key[i];
+        itopk_indices[device::swizzling(j)]   = val[i];
+      }
+    }
+  } else {
+    // Use two warps (64 threads) or more
+    constexpr unsigned max_itopk_per_warp = (MAX_ITOPK + 1) / 2;
+    constexpr unsigned N                  = (max_itopk_per_warp + 31) / 32;
+    float key[N];
+    std::uint32_t val[N];
+    if (first) {
+      /* Load itop results (not sorted) */
+      if (warp_id < 2) {
+        for (unsigned i = 0; i < N; i++) {
+          unsigned j = lane_id + (32 * i) + (max_itopk_per_warp * warp_id);
+          if (j < num_itopk) {
+            key[i] = itopk_distances[j];
+            val[i] = itopk_indices[j];
+          } else {
+            key[i] = utils::get_max_value<float>();
+            val[i] = utils::get_max_value<std::uint32_t>();
+          }
+        }
+        /* Warp Sort */
+        bitonic::warp_sort<float, std::uint32_t, N>(key, val);
+        /* Store intermedidate results */
+        for (unsigned i = 0; i < N; i++) {
+          unsigned j = (N * threadIdx.x) + i;
+          if (j >= num_itopk) continue;
+          itopk_distances[device::swizzling(j)] = key[i];
+          itopk_indices[device::swizzling(j)]   = val[i];
+        }
+      }
+      __syncthreads();
+      if (warp_id < 2) {
+        /* Load intermedidate results */
+        for (unsigned i = 0; i < N; i++) {
+          unsigned j = (N * threadIdx.x) + i;
+          unsigned k = MAX_ITOPK - 1 - j;
+          if (k >= num_itopk) continue;
+          float temp_key = itopk_distances[device::swizzling(k)];
+          if (key[i] == temp_key) continue;
+          if ((warp_id == 0) == (key[i] > temp_key)) {
+            key[i] = temp_key;
+            val[i] = itopk_indices[device::swizzling(k)];
+          }
+        }
+        /* Warp Merge */
+        bitonic::warp_merge<float, std::uint32_t, N>(key, val, 32);
+      }
+      __syncthreads();
+      /* Store itopk results (sorted) */
+      if (warp_id < 2) {
+        for (unsigned i = 0; i < N; i++) {
+          unsigned j = (N * threadIdx.x) + i;
+          if (j >= num_itopk) continue;
+          itopk_distances[device::swizzling(j)] = key[i];
+          itopk_indices[device::swizzling(j)]   = val[i];
+        }
+      }
+    }
+    const uint32_t num_itopk_div2 = num_itopk / 2;
+    if (threadIdx.x < 3) {
+      // work_buf is used to obtain turning points in 1st and 2nd half of itopk afer merge.
+      work_buf[threadIdx.x] = num_itopk_div2;
+    }
+    __syncthreads();
+
+    // Merge candidates (using whole threads)
+    for (unsigned k = threadIdx.x; k < min(num_candidates, num_itopk); k += blockDim.x) {
+      const unsigned j          = num_itopk - 1 - k;
+      const float itopk_key     = itopk_distances[device::swizzling(j)];
+      const float candidate_key = candidate_distances[device::swizzling(k)];
+      if (itopk_key > candidate_key) {
+        itopk_distances[device::swizzling(j)] = candidate_key;
+        itopk_indices[device::swizzling(j)]   = candidate_indices[device::swizzling(k)];
+        if (j < num_itopk_div2) {
+          atomicMin(work_buf + 2, j);
+        } else {
+          atomicMin(work_buf + 1, j - num_itopk_div2);
+        }
+      }
+    }
+    __syncthreads();
+
+    // Merge 1st and 2nd half of itopk (using whole threads)
+    for (unsigned j = threadIdx.x; j < num_itopk_div2; j += blockDim.x) {
+      const unsigned k = j + num_itopk_div2;
+      float key_0      = itopk_distances[device::swizzling(j)];
+      float key_1      = itopk_distances[device::swizzling(k)];
+      if (key_0 > key_1) {
+        itopk_distances[device::swizzling(j)] = key_1;
+        itopk_distances[device::swizzling(k)] = key_0;
+        std::uint32_t val_0                   = itopk_indices[device::swizzling(j)];
+        std::uint32_t val_1                   = itopk_indices[device::swizzling(k)];
+        itopk_indices[device::swizzling(j)]   = val_1;
+        itopk_indices[device::swizzling(k)]   = val_0;
+        atomicMin(work_buf + 0, j);
+      }
+    }
+    if (threadIdx.x == blockDim.x - 1) {
+      if (work_buf[2] < num_itopk_div2) { work_buf[1] = work_buf[2]; }
+    }
+    __syncthreads();
+    // if ((blockIdx.x == 0) && (threadIdx.x == 0)) {
+    //     RAFT_LOG_DEBUG( "work_buf: %u, %u, %u\n", work_buf[0], work_buf[1], work_buf[2] );
+    // }
+
+    // Warp-0 merges 1st half of itopk, warp-1 does 2nd half.
+    if (warp_id < 2) {
+      // Load intermediate itopk results
+      const uint32_t turning_point = work_buf[warp_id];  // turning_point <= num_itopk_div2
+      for (unsigned i = 0; i < N; i++) {
+        unsigned k = num_itopk;
+        unsigned j = (N * lane_id) + i;
+        if (j < turning_point) {
+          k = j + (num_itopk_div2 * warp_id);
+        } else if (j >= (MAX_ITOPK / 2 - num_itopk_div2)) {
+          j -= (MAX_ITOPK / 2 - num_itopk_div2);
+          if ((turning_point <= j) && (j < num_itopk_div2)) { k = j + (num_itopk_div2 * warp_id); }
+        }
+        if (k < num_itopk) {
+          key[i] = itopk_distances[device::swizzling(k)];
+          val[i] = itopk_indices[device::swizzling(k)];
+        } else {
+          key[i] = utils::get_max_value<float>();
+          val[i] = utils::get_max_value<std::uint32_t>();
+        }
+      }
+      /* Warp Merge */
+      bitonic::warp_merge<float, std::uint32_t, N>(key, val, 32);
+      /* Store new itopk results */
+      for (unsigned i = 0; i < N; i++) {
+        const unsigned j = (N * lane_id) + i;
+        if (j < num_itopk_div2) {
+          unsigned k                            = j + (num_itopk_div2 * warp_id);
+          itopk_distances[device::swizzling(k)] = key[i];
+          itopk_indices[device::swizzling(k)]   = val[i];
+        }
+      }
+    }
+  }
+}
+// Top-k update: bitonic-sorts the candidate list, then merges it into the internal top-k list.
+template <unsigned MAX_ITOPK,       // forwarded to topk_by_bitonic_sort_2nd
+          unsigned MAX_CANDIDATES,  // forwarded to topk_by_bitonic_sort_1st
+          unsigned MULTI_WARPS_1,   // multi-warp switch passed to the sort stage
+          unsigned MULTI_WARPS_2>   // multi-warp switch passed to the merge stage
+__device__ void topk_by_bitonic_sort(float* itopk_distances,        // [num_itopk]
+                                     std::uint32_t* itopk_indices,  // [num_itopk]
+                                     const std::uint32_t num_itopk,
+                                     float* candidate_distances,        // [num_candidates]
+                                     std::uint32_t* candidate_indices,  // [num_candidates]
+                                     const std::uint32_t num_candidates,
+                                     std::uint32_t* work_buf,  // scratch words for the merge stage
+                                     const bool first)         // forwarded to the merge stage
+{
+  // Stage 1: the entries in candidate_distances/indices are sorted by bitonic sort.
+  topk_by_bitonic_sort_1st<MAX_CANDIDATES, MULTI_WARPS_1>(
+    candidate_distances, candidate_indices, num_candidates, num_itopk);
+
+  // Stage 2: the candidates sorted above are merged with the internal intermediate
+  // top-k results so far (itopk_distances/indices) using bitonic merge.
+  topk_by_bitonic_sort_2nd<MAX_ITOPK, MULTI_WARPS_2>(itopk_distances,
+                                                     itopk_indices,
+                                                     num_itopk,
+                                                     candidate_distances,
+                                                     candidate_indices,
+                                                     num_candidates,
+                                                     work_buf,
+                                                     first);
+}
+
+// Re-inserts the top-k indices (MSB cleared) into the hash table using threads [FIRST_TID, LAST_TID).
+template <unsigned FIRST_TID, unsigned LAST_TID, class INDEX_T>
+__device__ inline void hashmap_restore(uint32_t* hashmap_ptr,
+                                       const size_t hashmap_bitlen,
+                                       const INDEX_T* itopk_indices,
+                                       uint32_t itopk_size)
+{
+  if (!(threadIdx.x >= FIRST_TID && threadIdx.x < LAST_TID)) { return; }
+  for (unsigned pos = threadIdx.x - FIRST_TID; pos < itopk_size; pos += LAST_TID - FIRST_TID) {
+    hashmap::insert(hashmap_ptr, hashmap_bitlen, itopk_indices[pos] & ~0x80000000);
+  }
+}
+
+// Fills ptr[0, count) with `fill`: each calling thread starts at threadIdx.x and advances by
+// BLOCK_SIZE. No synchronization is performed here.
+template <class T, unsigned BLOCK_SIZE>
+__device__ inline void set_value_device(T* const ptr, const T fill, const std::uint32_t count)
+{
+  for (std::uint32_t pos = threadIdx.x; pos < count; pos += BLOCK_SIZE) { ptr[pos] = fill; }
+}
+
+// One query per thread block: blockIdx.y is the query id; per-query buffers live in shared memory.
+template <unsigned TEAM_SIZE,
+          unsigned BLOCK_SIZE,
+          unsigned BLOCK_COUNT,
+          unsigned MAX_ITOPK,
+          unsigned MAX_CANDIDATES,
+          unsigned TOPK_BY_BITONIC_SORT,
+          unsigned MAX_DATASET_DIM,
+          class DATA_T,
+          class DISTANCE_T,
+          class INDEX_T,
+          class LOAD_T>
+__launch_bounds__(BLOCK_SIZE, BLOCK_COUNT) __global__
+  void search_kernel(INDEX_T* const result_indices_ptr,       // [num_queries, top_k]
+                     DISTANCE_T* const result_distances_ptr,  // [num_queries, top_k] or nullptr
+                     const std::uint32_t top_k,               // results written per query
+                     const DATA_T* const dataset_ptr,  // [dataset_size, dataset_dim]
+                     const std::size_t dataset_dim,
+                     const std::size_t dataset_size,
+                     const DATA_T* const queries_ptr,  // [num_queries, dataset_dim]
+                     const INDEX_T* const knn_graph,   // [dataset_size, graph_degree]
+                     const std::uint32_t graph_degree,
+                     const unsigned num_distilation,
+                     const uint64_t rand_xor_mask,
+                     const INDEX_T* seed_ptr,  // [num_queries, num_seeds] or nullptr
+                     const uint32_t num_seeds,
+                     std::uint32_t* const visited_hashmap_ptr,  // [num_queries, 1 << hash_bitlen]
+                     const std::uint32_t internal_topk,         // size of the internal top-k list
+                     const std::uint32_t num_parents,           // parents expanded per iteration
+                     const std::uint32_t min_iteration,  // early exit is ignored while iter is below
+                     const std::uint32_t max_iteration,  // loop stops when iter + 1 reaches this
+                     std::uint32_t* const num_executed_iterations,  // [num_queries] or nullptr
+                     const std::uint32_t hash_bitlen,
+                     const std::uint32_t small_hash_bitlen,  // nonzero: hash table in shared memory
+                     const std::uint32_t small_hash_reset_interval)  // reset period in iterations
+{
+  const auto query_id = blockIdx.y;  // each block handles one query
+
+#ifdef _CLK_BREAKDOWN
+  std::uint64_t clk_init                 = 0;
+  std::uint64_t clk_compute_1st_distance = 0;
+  std::uint64_t clk_topk                 = 0;
+  std::uint64_t clk_reset_hash           = 0;
+  std::uint64_t clk_pickup_parents       = 0;
+  std::uint64_t clk_restore_hash         = 0;
+  std::uint64_t clk_compute_distance     = 0;
+  std::uint64_t clk_start;
+#define _CLK_START() clk_start = clock64()
+#define _CLK_REC(V)  V += clock64() - clk_start;
+#else
+#define _CLK_START()
+#define _CLK_REC(V)
+#endif
+  _CLK_START();
+
+  extern __shared__ std::uint32_t smem[];
+
+  // Layout of result_buffer
+  // +----------------------+------------------------------+---------+
+  // | internal_top_k       | neighbors of internal_top_k  | padding |
+  // | <internal_topk_size> | <num_parents * graph_degree> | < 32    |
+  // +----------------------+------------------------------+---------+
+  // |<---             result_buffer_size              --->|
+  std::uint32_t result_buffer_size    = internal_topk + (num_parents * graph_degree);
+  std::uint32_t result_buffer_size_32 = result_buffer_size;
+  if (result_buffer_size % 32) { result_buffer_size_32 += 32 - (result_buffer_size % 32); }
+  const auto small_hash_size = hashmap::get_size(small_hash_bitlen);
+  auto query_buffer          = reinterpret_cast<float*>(smem);
+  auto result_indices_buffer = reinterpret_cast<INDEX_T*>(query_buffer + MAX_DATASET_DIM);
+  auto result_distances_buffer =
+    reinterpret_cast<DISTANCE_T*>(result_indices_buffer + result_buffer_size_32);
+  auto visited_hash_buffer =
+    reinterpret_cast<std::uint32_t*>(result_distances_buffer + result_buffer_size_32);
+  auto parent_list_buffer = reinterpret_cast<std::uint32_t*>(visited_hash_buffer + small_hash_size);
+  auto topk_ws            = reinterpret_cast<std::uint32_t*>(parent_list_buffer + num_parents);
+  auto terminate_flag     = reinterpret_cast<std::uint32_t*>(topk_ws + 3);  // topk_ws: 3 words
+  auto smem_working_ptr   = reinterpret_cast<std::uint32_t*>(terminate_flag + 1);  // radix top-k ws
+
+  const DATA_T* const query_ptr = queries_ptr + query_id * dataset_dim;  // this block's query
+  for (unsigned i = threadIdx.x; i < MAX_DATASET_DIM; i += BLOCK_SIZE) {
+    unsigned j = device::swizzling(i);
+    if (i < dataset_dim) {
+      query_buffer[j] = static_cast<float>(query_ptr[i]) * device::fragment_scale<DATA_T>();
+    } else {
+      query_buffer[j] = 0.0;  // zero-pad up to MAX_DATASET_DIM
+    }
+  }
+  if (threadIdx.x == 0) {
+    terminate_flag[0] = 0;
+    topk_ws[0]        = ~0u;
+  }
+
+  // Init hashmap (shared-memory table if small_hash_bitlen != 0, else this query's slice)
+  uint32_t* local_visited_hashmap_ptr;
+  if (small_hash_bitlen) {
+    local_visited_hashmap_ptr = visited_hash_buffer;
+  } else {
+    local_visited_hashmap_ptr = visited_hashmap_ptr + (hashmap::get_size(hash_bitlen) * query_id);
+  }
+  hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+  __syncthreads();
+  _CLK_REC(clk_init);
+
+  // compute distance to randomly selected nodes
+  _CLK_START();
+  const INDEX_T* const local_seed_ptr = seed_ptr ? seed_ptr + (num_seeds * query_id) : nullptr;
+  device::compute_distance_to_random_nodes<TEAM_SIZE, MAX_DATASET_DIM, LOAD_T>(
+    result_indices_buffer,
+    result_distances_buffer,
+    query_buffer,
+    dataset_ptr,
+    dataset_dim,
+    dataset_size,
+    result_buffer_size,
+    num_distilation,
+    rand_xor_mask,
+    local_seed_ptr,
+    num_seeds,
+    local_visited_hashmap_ptr,
+    hash_bitlen);
+  __syncthreads();
+  _CLK_REC(clk_compute_1st_distance);
+
+  std::uint32_t iter = 0;
+  while (1) {
+    // top-k update of the result buffer (bitonic-sort or radix-sort variant)
+    if (TOPK_BY_BITONIC_SORT) {
+      // [Notice]
+      // It is good to use multiple warps in topk_by_bitonic_sort() when
+      // batch size is small (short-latency), but it might not be always good
+      // when batch size is large (high-throughput).
+      // topk_by_bitonic_sort() consists of two operations:
+      // if MAX_CANDIDATES is greater than 128, the first operation uses two warps;
+      // if MAX_ITOPK is greater than 256, the second operation uses two warps.
+      constexpr unsigned multi_warps_1 = ((BLOCK_SIZE >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0;
+      constexpr unsigned multi_warps_2 = ((BLOCK_SIZE >= 64) && (MAX_ITOPK > 256)) ? 1 : 0;
+
+      // reset small-hash table.
+      if ((iter + 1) % small_hash_reset_interval == 0) {
+        // Depending on the block size and the number of warps used in
+        // topk_by_bitonic_sort(), determine which warps are used to reset
+        // the small hash and whether they are performed in overlap with
+        // topk_by_bitonic_sort().
+        _CLK_START();
+        if (BLOCK_SIZE == 32) {
+          hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+        } else if (BLOCK_SIZE == 64) {
+          if (multi_warps_1 || multi_warps_2) {
+            hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+          } else {
+            hashmap::init<32, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+          }
+        } else {
+          if (multi_warps_1 || multi_warps_2) {
+            hashmap::init<64, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+          } else {
+            hashmap::init<32, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+          }
+        }
+        _CLK_REC(clk_reset_hash);
+      }
+
+      // topk with bitonic sort
+      _CLK_START();
+      topk_by_bitonic_sort<MAX_ITOPK, MAX_CANDIDATES, multi_warps_1, multi_warps_2>(
+        result_distances_buffer,
+        result_indices_buffer,
+        internal_topk,
+        result_distances_buffer + internal_topk,
+        result_indices_buffer + internal_topk,
+        num_parents * graph_degree,
+        topk_ws,
+        (iter == 0));
+      _CLK_REC(clk_topk);
+
+    } else {
+      _CLK_START();
+      // topk with radix block sort
+      topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE>{}(
+        internal_topk,
+        gridDim.x,
+        result_buffer_size,
+        reinterpret_cast<std::uint32_t*>(result_distances_buffer),
+        result_indices_buffer,
+        reinterpret_cast<std::uint32_t*>(result_distances_buffer),
+        result_indices_buffer,
+        nullptr,
+        topk_ws,
+        true,
+        reinterpret_cast<std::uint32_t*>(smem_working_ptr));
+      _CLK_REC(clk_topk);
+
+      // reset small-hash table
+      if ((iter + 1) % small_hash_reset_interval == 0) {
+        _CLK_START();
+        hashmap::init<0, BLOCK_SIZE>(local_visited_hashmap_ptr, hash_bitlen);
+        _CLK_REC(clk_reset_hash);
+      }
+    }
+    __syncthreads();
+
+    if (iter + 1 == max_iteration) { break; }  // iteration cap reached
+
+    // pick up next parents
+    if (threadIdx.x < 32) {
+      _CLK_START();
+      pickup_next_parents<TOPK_BY_BITONIC_SORT, INDEX_T>(terminate_flag,
+                                                         parent_list_buffer,
+                                                         result_indices_buffer,
+                                                         internal_topk,
+                                                         dataset_size,
+                                                         num_parents);
+      _CLK_REC(clk_pickup_parents);
+    }
+
+    // restore small-hash table by putting internal-topk indices in it
+    if ((iter + 1) % small_hash_reset_interval == 0) {
+      constexpr unsigned first_tid = ((BLOCK_SIZE <= 32) ? 0 : 32);
+      _CLK_START();
+      hashmap_restore<first_tid, BLOCK_SIZE>(
+        local_visited_hashmap_ptr, hash_bitlen, result_indices_buffer, internal_topk);
+      _CLK_REC(clk_restore_hash);
+    }
+    __syncthreads();
+
+    if (*terminate_flag && iter >= min_iteration) { break; }  // early exit once permitted
+
+    // compute the norms between child nodes and query node
+    _CLK_START();
+    constexpr unsigned max_n_frags = 16;
+    device::
+      compute_distance_to_child_nodes<TEAM_SIZE, BLOCK_SIZE, MAX_DATASET_DIM, max_n_frags, LOAD_T>(
+        result_indices_buffer + internal_topk,
+        result_distances_buffer + internal_topk,
+        query_buffer,
+        dataset_ptr,
+        dataset_dim,
+        knn_graph,
+        graph_degree,
+        local_visited_hashmap_ptr,
+        hash_bitlen,
+        parent_list_buffer,
+        num_parents);
+    __syncthreads();
+    _CLK_REC(clk_compute_distance);
+
+    iter++;
+  }
+  for (std::uint32_t i = threadIdx.x; i < top_k; i += BLOCK_SIZE) {
+    const std::size_t j = i + (static_cast<std::size_t>(top_k) * query_id);  // 64-bit offset
+    unsigned ii         = i;
+    if (TOPK_BY_BITONIC_SORT) { ii = device::swizzling(i); }  // bitonic path indexes swizzled
+    if (result_distances_ptr != nullptr) { result_distances_ptr[j] = result_distances_buffer[ii]; }
+    result_indices_ptr[j] = result_indices_buffer[ii] & ~0x80000000;  // clear most significant bit
+  }
+  if (threadIdx.x == 0 && num_executed_iterations != nullptr) {
+    num_executed_iterations[query_id] = iter + 1;
+  }
+#ifdef _CLK_BREAKDOWN
+  if ((threadIdx.x == 0 || threadIdx.x == BLOCK_SIZE - 1) && ((query_id * 3) % gridDim.y < 3)) {
+    RAFT_LOG_DEBUG(
+      "query, %u, thread, %u"
+      ", init, %lu"
+      ", 1st_distance, %lu"
+      ", topk, %lu"
+      ", reset_hash, %lu"
+      ", pickup_parents, %lu"
+      ", restore_hash, %lu"
+      ", distance, %lu"
+      "\n",
+      query_id,
+      threadIdx.x,
+      clk_init,
+      clk_compute_1st_distance,
+      clk_topk,
+      clk_reset_hash,
+      clk_pickup_parents,
+      clk_restore_hash,
+      clk_compute_distance);
+  }
+#endif
+}
+// Binds `kernel` to one search_kernel instantiation; TEAM_SIZE, DATA_T, etc. come from the use site.
+#define SET_KERNEL_3(                                                               \
+  BLOCK_SIZE, BLOCK_COUNT, MAX_ITOPK, MAX_CANDIDATES, TOPK_BY_BITONIC_SORT, LOAD_T) \
+  kernel = search_kernel<TEAM_SIZE,                                                 \
+                         BLOCK_SIZE,                                                \
+                         BLOCK_COUNT,                                               \
+                         MAX_ITOPK,                                                 \
+                         MAX_CANDIDATES,                                            \
+                         TOPK_BY_BITONIC_SORT,                                      \
+                         MAX_DATASET_DIM,                                           \
+                         DATA_T,                                                    \
+                         DISTANCE_T,                                                \
+                         INDEX_T,                                                   \
+                         LOAD_T>;
+// Picks LOAD_T from the runtime load_bit_length (128 or 64); assigns nothing for any other value.
+#define SET_KERNEL_2(BLOCK_SIZE, BLOCK_COUNT, MAX_ITOPK, MAX_CANDIDATES, TOPK_BY_BITONIC_SORT) \
+  if (load_bit_length == 128) {                                                                \
+    SET_KERNEL_3(BLOCK_SIZE,                                                                   \
+                 BLOCK_COUNT,                                                                  \
+                 MAX_ITOPK,                                                                    \
+                 MAX_CANDIDATES,                                                               \
+                 TOPK_BY_BITONIC_SORT,                                                         \
+                 device::LOAD_128BIT_T)                                                        \
+  } else if (load_bit_length == 64) {                                                          \
+    SET_KERNEL_3(BLOCK_SIZE,                                                                   \
+                 BLOCK_COUNT,                                                                  \
+                 MAX_ITOPK,                                                                    \
+                 MAX_CANDIDATES,                                                               \
+                 TOPK_BY_BITONIC_SORT,                                                         \
+                 device::LOAD_64BIT_T)                                                         \
+  }
+// Bitonic-sort variant (TOPK_BY_BITONIC_SORT = 1): maps block_size to (BLOCK_SIZE, BLOCK_COUNT).
+#define SET_KERNEL_1B(MAX_ITOPK, MAX_CANDIDATES)              \
+  /* if ( block_size == 32 ) {                                \
+      SET_KERNEL_2( 32, 20, MAX_ITOPK, MAX_CANDIDATES, 1 )    \
+  } else */                                                   \
+  if (block_size == 64) {                                     \
+    SET_KERNEL_2(64, 16 /*20*/, MAX_ITOPK, MAX_CANDIDATES, 1) \
+  } else if (block_size == 128) {                             \
+    SET_KERNEL_2(128, 8, MAX_ITOPK, MAX_CANDIDATES, 1)        \
+  } else if (block_size == 256) {                             \
+    SET_KERNEL_2(256, 4, MAX_ITOPK, MAX_CANDIDATES, 1)        \
+  } else if (block_size == 512) {                             \
+    SET_KERNEL_2(512, 2, MAX_ITOPK, MAX_CANDIDATES, 1)        \
+  } else {                                                    \
+    SET_KERNEL_2(1024, 1, MAX_ITOPK, MAX_CANDIDATES, 1)       \
+  }
+// Radix-sort variant (TOPK_BY_BITONIC_SORT = 0): block_size 256 or 512, anything else selects 1024.
+#define SET_KERNEL_1R(MAX_ITOPK, MAX_CANDIDATES)        \
+  if (block_size == 256) {                              \
+    SET_KERNEL_2(256, 4, MAX_ITOPK, MAX_CANDIDATES, 0)  \
+  } else if (block_size == 512) {                       \
+    SET_KERNEL_2(512, 2, MAX_ITOPK, MAX_CANDIDATES, 0)  \
+  } else {                                              \
+    SET_KERNEL_2(1024, 1, MAX_ITOPK, MAX_CANDIDATES, 0) \
+  }
+// Declares `kernel` and dispatches on num_itopk_candidates / itopk_size; null if nothing matches.
+#define SET_KERNEL                                                                \
+  typedef void (*search_kernel_t)(INDEX_T* const result_indices_ptr,              \
+                                  DISTANCE_T* const result_distances_ptr,         \
+                                  const std::uint32_t top_k,                      \
+                                  const DATA_T* const dataset_ptr,                \
+                                  const std::size_t dataset_dim,                  \
+                                  const std::size_t dataset_size,                 \
+                                  const DATA_T* const queries_ptr,                \
+                                  const INDEX_T* const knn_graph,                 \
+                                  const std::uint32_t graph_degree,               \
+                                  const unsigned num_distilation,                 \
+                                  const uint64_t rand_xor_mask,                   \
+                                  const INDEX_T* seed_ptr,                        \
+                                  const uint32_t num_seeds,                       \
+                                  std::uint32_t* const visited_hashmap_ptr,       \
+                                  const std::uint32_t itopk_size,                 \
+                                  const std::uint32_t num_parents,                \
+                                  const std::uint32_t min_iteration,              \
+                                  const std::uint32_t max_iteration,              \
+                                  std::uint32_t* const num_executed_iterations,   \
+                                  const std::uint32_t hash_bitlen,                \
+                                  const std::uint32_t small_hash_bitlen,          \
+                                  const std::uint32_t small_hash_reset_interval); \
+  search_kernel_t kernel = nullptr; /* stays null when no branch below matches */ \
+  if (num_itopk_candidates <= 64) {                                               \
+    constexpr unsigned max_candidates = 64;                                       \
+    if (itopk_size <= 64) {                                                       \
+      SET_KERNEL_1B(64, max_candidates)                                           \
+    } else if (itopk_size <= 128) {                                               \
+      SET_KERNEL_1B(128, max_candidates)                                          \
+    } else if (itopk_size <= 256) {                                               \
+      SET_KERNEL_1B(256, max_candidates)                                          \
+    } else if (itopk_size <= 512) {                                               \
+      SET_KERNEL_1B(512, max_candidates)                                          \
+    }                                                                             \
+  } else if (num_itopk_candidates <= 128) {                                       \
+    constexpr unsigned max_candidates = 128;                                      \
+    if (itopk_size <= 64) {                                                       \
+      SET_KERNEL_1B(64, max_candidates)                                           \
+    } else if (itopk_size <= 128) {                                               \
+      SET_KERNEL_1B(128, max_candidates)                                          \
+    } else if (itopk_size <= 256) {                                               \
+      SET_KERNEL_1B(256, max_candidates)                                          \
+    } else if (itopk_size <= 512) {                                               \
+      SET_KERNEL_1B(512, max_candidates)                                          \
+    }                                                                             \
+  } else if (num_itopk_candidates <= 256) {                                       \
+    constexpr unsigned max_candidates = 256;                                      \
+    if (itopk_size <= 64) {                                                       \
+      SET_KERNEL_1B(64, max_candidates)                                           \
+    } else if (itopk_size <= 128) {                                               \
+      SET_KERNEL_1B(128, max_candidates)                                          \
+    } else if (itopk_size <= 256) {                                               \
+      SET_KERNEL_1B(256, max_candidates)                                          \
+    } else if (itopk_size <= 512) {                                               \
+      SET_KERNEL_1B(512, max_candidates)                                          \
+    }                                                                             \
+  } else {                                                                        \
+    /* Radix-based topk is used */                                                \
+    if (itopk_size <= 256) {                                                      \
+      SET_KERNEL_1R(256, /*to avoid build failure*/ 32)                           \
+    } else if (itopk_size <= 512) {                                               \
+      SET_KERNEL_1R(512, /*to avoid build failure*/ 32)                           \
+    }                                                                             \
+  }
+
+template <unsigned TEAM_SIZE,
+          unsigned MAX_DATASET_DIM,
+          typename DATA_T,
+          typename INDEX_T,
+          typename DISTANCE_T>
+struct search : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T> {
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_queries;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::itopk_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::algo;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::team_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_parents;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::min_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::load_bit_length;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::thread_block_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_mode;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_min_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_max_fill_rate;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_random_samplings;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::rand_xor_mask;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::max_dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dim;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::graph_degree;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::topk;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hash_bitlen;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_bitlen;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::small_hash_reset_interval;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dataset_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::result_buffer_size;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::smem_size;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::load_bit_lenght;
+
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::hashmap;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_executed_iterations;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::dev_seed;
+  using search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>::num_seeds;
+
+  uint32_t num_itopk_candidates;
+
+  search(raft::device_resources const& res,
+         search_params params,
+         int64_t dim,
+         int64_t graph_degree,
+         uint32_t topk)
+    : search_plan_impl<DATA_T, INDEX_T, DISTANCE_T>(res, params, dim, graph_degree, topk)
+  {
+    set_params(res);
+  }
+
+  ~search() {}
+
+  inline void set_params(raft::device_resources const& res)
+  {
+    num_itopk_candidates = num_parents * graph_degree;
+    result_buffer_size   = itopk_size + num_itopk_candidates;
+
+    typedef raft::Pow2<32> AlignBytes;
+    unsigned result_buffer_size_32 = AlignBytes::roundUp(result_buffer_size);
+
+    constexpr unsigned max_itopk = 512;
+    RAFT_EXPECTS(itopk_size <= max_itopk, "itopk_size cannot be larger than %u", max_itopk);
+
+    RAFT_LOG_DEBUG("# num_itopk_candidates: %u", num_itopk_candidates);
+    RAFT_LOG_DEBUG("# num_itopk: %u", itopk_size);
+    //
+    // Determine the thread block size
+    //
+    constexpr unsigned min_block_size       = 64;  // 32 or 64
+    constexpr unsigned min_block_size_radix = 256;
+    constexpr unsigned max_block_size       = 1024;
+    //
+    const std::uint32_t topk_ws_size = 3;
+    const std::uint32_t base_smem_size =
+      sizeof(float) * max_dim + (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 +
+      sizeof(std::uint32_t) * hashmap::get_size(small_hash_bitlen) +
+      sizeof(std::uint32_t) * num_parents + sizeof(std::uint32_t) * topk_ws_size +
+      sizeof(std::uint32_t);
+    smem_size = base_smem_size;
+    if (num_itopk_candidates > 256) {
+      // Tentatively calculate the required shared memory size when radix
+      // sort based topk is used, assuming the block size is the maximum.
+      if (itopk_size <= 256) {
+        smem_size += topk_by_radix_sort<256, max_block_size>::smem_size * sizeof(std::uint32_t);
+      } else {
+        smem_size += topk_by_radix_sort<512, max_block_size>::smem_size * sizeof(std::uint32_t);
+      }
+    }
+
+    uint32_t block_size = thread_block_size;
+    if (block_size == 0) {
+      block_size = min_block_size;
+
+      if (num_itopk_candidates > 256) {
+        // radix-based topk is used.
+        block_size = min_block_size_radix;
+
+        // Internal topk values per thread must be equal to or less than 4
+        // when radix-sort block_topk is used.
+        while ((block_size < max_block_size) && (max_itopk / block_size > 4)) {
+          block_size *= 2;
+        }
+      }
+
+      // Increase block size according to shared memory requirements.
+      // If block size is 32, upper limit of shared memory size per
+      // thread block is set to 4096. This is GPU generation dependent.
+      constexpr unsigned ulimit_smem_size_cta32 = 4096;
+      while (smem_size > ulimit_smem_size_cta32 / 32 * block_size) {
+        block_size *= 2;
+      }
+
+      // Increase block size to improve GPU occupancy when batch size
+      // is small, that is, number of queries is low.
+      cudaDeviceProp deviceProp = res.get_device_properties();
+      RAFT_LOG_DEBUG("# multiProcessorCount: %d", deviceProp.multiProcessorCount);
+      while ((block_size < max_block_size) &&
+             (graph_degree * num_parents * team_size >= block_size * 2) &&
+             (max_queries <= (1024 / (block_size * 2)) * deviceProp.multiProcessorCount)) {
+        block_size *= 2;
+      }
+    }
+    RAFT_LOG_DEBUG("# thread_block_size: %u", block_size);
+    RAFT_EXPECTS(block_size >= min_block_size,
+                 "block_size cannot be smaller than min_block size, %u",
+                 min_block_size);
+    RAFT_EXPECTS(block_size <= max_block_size,
+                 "block_size cannot be larger than max_block size %u",
+                 max_block_size);
+    thread_block_size = block_size;
+
+    // Determine load bit length
+    const uint32_t total_bit_length = dim * sizeof(DATA_T) * 8;
+    if (load_bit_length == 0) {
+      load_bit_length = 128;
+      while (total_bit_length % load_bit_length) {
+        load_bit_length /= 2;
+      }
+    }
+    RAFT_LOG_DEBUG("# load_bit_length: %u  (%u loads per vector)",
+                   load_bit_length,
+                   total_bit_length / load_bit_length);
+    RAFT_EXPECTS(total_bit_length % load_bit_length == 0,
+                 "load_bit_length must be a divisor of dim*sizeof(data_t)*8=%u",
+                 total_bit_length);
+    RAFT_EXPECTS(load_bit_length >= 64, "load_bit_lenght cannot be less than 64");
+
+    if (num_itopk_candidates <= 256) {
+      RAFT_LOG_DEBUG("# bitonic-sort based topk routine is used");
+    } else {
+      RAFT_LOG_DEBUG("# radix-sort based topk routine is used");
+      smem_size = base_smem_size;
+      if (itopk_size <= 256) {
+        constexpr unsigned MAX_ITOPK = 256;
+        if (block_size == 256) {
+          constexpr unsigned BLOCK_SIZE = 256;
+          smem_size += topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE>::smem_size * sizeof(std::uint32_t);
+        } else if (block_size == 512) {
+          constexpr unsigned BLOCK_SIZE = 512;
+          smem_size += topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE>::smem_size * sizeof(std::uint32_t);
+        } else {
+          constexpr unsigned BLOCK_SIZE = 1024;
+          smem_size += topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE>::smem_size * sizeof(std::uint32_t);
+        }
+      } else {
+        constexpr unsigned MAX_ITOPK = 512;
+        if (block_size == 256) {
+          constexpr unsigned BLOCK_SIZE = 256;
+          smem_size += topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE>::smem_size * sizeof(std::uint32_t);
+        } else if (block_size == 512) {
+          constexpr unsigned BLOCK_SIZE = 512;
+          smem_size += topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE>::smem_size * sizeof(std::uint32_t);
+        } else {
+          constexpr unsigned BLOCK_SIZE = 1024;
+          smem_size += topk_by_radix_sort<MAX_ITOPK, BLOCK_SIZE>::smem_size * sizeof(std::uint32_t);
+        }
+      }
+    }
+    RAFT_LOG_DEBUG("# smem_size: %u", smem_size);
+    hashmap_size = 0;
+    if (small_hash_bitlen == 0) {
+      hashmap_size = sizeof(uint32_t) * max_queries * hashmap::get_size(hash_bitlen);
+      hashmap.resize(hashmap_size, res.get_stream());
+    }
+    RAFT_LOG_DEBUG("# hashmap_size: %lu", hashmap_size);
+  }
+  // Launches the search kernel on res's stream: num_queries blocks of thread_block_size threads.
+  void operator()(raft::device_resources const& res,
+                  raft::device_matrix_view<const DATA_T, INDEX_T, row_major> dataset,
+                  raft::device_matrix_view<const INDEX_T, INDEX_T, row_major> graph,
+                  INDEX_T* const result_indices_ptr,       // [num_queries, topk]
+                  DISTANCE_T* const result_distances_ptr,  // [num_queries, topk]
+                  const DATA_T* const queries_ptr,         // [num_queries, dataset_dim]
+                  const std::uint32_t num_queries,
+                  const INDEX_T* dev_seed_ptr,                   // [num_queries, num_seeds]
+                  std::uint32_t* const num_executed_iterations,  // [num_queries]
+                  uint32_t topk)
+  {
+    cudaStream_t stream = res.get_stream();
+    uint32_t block_size = thread_block_size;
+    SET_KERNEL;  // NOTE(review): macro defined elsewhere; presumably declares `kernel` - confirm
+    RAFT_CUDA_TRY(
+      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    dim3 thread_dims(block_size, 1, 1);
+    dim3 block_dims(1, num_queries, 1);  // num_queries blocks along grid y
+    RAFT_LOG_DEBUG(
+      "Launching kernel with %u threads, %u block %lu smem", block_size, num_queries, smem_size);
+    kernel<<<block_dims, thread_dims, smem_size, stream>>>(result_indices_ptr,
+                                                           result_distances_ptr,
+                                                           topk,
+                                                           dataset.data_handle(),
+                                                           dataset.extent(1),
+                                                           dataset.extent(0),
+                                                           queries_ptr,
+                                                           graph.data_handle(),
+                                                           graph.extent(1),
+                                                           num_random_samplings,
+                                                           rand_xor_mask,
+                                                           dev_seed_ptr,
+                                                           num_seeds,
+                                                           hashmap.data(),
+                                                           itopk_size,
+                                                           num_parents,
+                                                           min_iterations,
+                                                           max_iterations,
+                                                           num_executed_iterations,
+                                                           hash_bitlen,
+                                                           small_hash_bitlen,
+                                                           small_hash_reset_interval);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());  // check that the launch itself did not fail
+  }
+};
+
+}  // namespace single_cta_search
+}  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk.h b/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk.h
new file mode 100644
index 0000000000..ccb65fd0ea
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuda_fp16.h>
+#include <stdint.h>
+
+namespace raft::neighbors::experimental::cagra::detail {
+
+// Declaration only. NOTE(review): presumably the workspace size for _cuann_find_topk - confirm.
+size_t _cuann_find_topk_bufferSize(uint32_t topK,
+                                   uint32_t sizeBatch,
+                                   uint32_t numElements,
+                                   cudaDataType_t sampleDtype = CUDA_R_32F);
+
+// Declaration only. NOTE(review): presumably picks topK keys/values per batch row - confirm.
+void _cuann_find_topk(uint32_t topK,
+                      uint32_t sizeBatch,
+                      uint32_t numElements,
+                      const float* inputKeys,     // [sizeBatch, ldIK,]
+                      uint32_t ldIK,              // (*) ldIK >= numElements
+                      const uint32_t* inputVals,  // [sizeBatch, ldIV,]
+                      uint32_t ldIV,              // (*) ldIV >= numElements
+                      float* outputKeys,          // [sizeBatch, ldOK,]
+                      uint32_t ldOK,              // (*) ldOK >= topK
+                      uint32_t* outputVals,       // [sizeBatch, ldOV,]
+                      uint32_t ldOV,              // (*) ldOV >= topK
+                      void* workspace,  // NOTE(review): presumably sized by _cuann_find_topk_bufferSize
+                      bool sort           = false,
+                      uint32_t* hint      = NULL,
+                      cudaStream_t stream = 0);
+
+#ifdef __CUDA_ARCH__
+#define CUDA_DEVICE_HOST_FUNC __device__
+#else
+#define CUDA_DEVICE_HOST_FUNC
+#endif
+// Rounds size up to the next multiple of unit (unit must be non-zero).
+CUDA_DEVICE_HOST_FUNC inline size_t _cuann_aligned(size_t size, size_t unit = 128)
+{
+  const size_t remainder = size % unit;
+  return (remainder == 0) ? size : size + (unit - remainder);
+}
+}  // namespace raft::neighbors::experimental::cagra::detail
\ No newline at end of file
diff --git a/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh b/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
new file mode 100644
index 0000000000..d09478d1db
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/topk_for_cagra/topk_core.cuh
@@ -0,0 +1,926 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include "topk.h"
+#include <assert.h>
+#include <cub/cub.cuh>
+#include <float.h>
+#include <stdint.h>
+#include <stdio.h>
+
+namespace raft::neighbors::experimental::cagra::detail {
+using namespace cub;
+
+// Flips every bit of x when its top bit is set, otherwise flips only the top bit.
+__device__ inline uint32_t convert(uint32_t x)
+{
+  // NOTE(review): presumably maps IEEE-754 float bits to keys that order as unsigned ints.
+  const uint32_t top_bit  = 0x80000000;
+  const uint32_t all_bits = 0xffffffff;
+  const uint32_t flip     = (x & top_bit) ? all_bits : top_bit;
+  return x ^ flip;
+}
+
+// 16-bit variant: flips every bit of x when its top bit is set, otherwise only the top bit.
+__device__ inline uint16_t convert(uint16_t x)
+{
+  // NOTE(review): presumably the same key transform for half-precision bit patterns - confirm.
+  const uint16_t top_bit  = 0x8000;
+  const uint16_t all_bits = 0xffff;
+  const uint16_t flip     = (x & top_bit) ? all_bits : top_bit;
+  return x ^ flip;
+}
+
+// One member per supported load width; load_u32_vector<vecLen> fills only the matching one.
+struct u32_vector {
+  uint1 x1;  // used when vecLen == 1
+  uint2 x2;  // used when vecLen == 2
+  uint4 x4;  // used when vecLen == 4
+  ulonglong4 x8;  // used when vecLen == 8; two 32-bit elements per 64-bit lane
+};
+
+// One member per supported load width; load_u16_vector<vecLen> fills only the matching one.
+struct u16_vector {
+  ushort1 x1;  // used when vecLen == 1
+  ushort2 x2;  // used when vecLen == 2
+  ushort4 x4;  // used when vecLen == 4
+  uint4 x8;  // used when vecLen == 8; two 16-bit elements per 32-bit lane
+};
+
+// Loads vecLen consecutive 32-bit values starting at x[i] into the matching member of vec.
+// Widths other than 1, 2, 4 and 8 leave vec untouched.
+template <int vecLen>
+__device__ inline void load_u32_vector(struct u32_vector& vec, const uint32_t* x, int i)
+{
+  const uint32_t* src = x + i;
+  switch (vecLen) {
+    case 1: vec.x1 = *reinterpret_cast<const uint1*>(src); break;
+    case 2: vec.x2 = *reinterpret_cast<const uint2*>(src); break;
+    case 4: vec.x4 = *reinterpret_cast<const uint4*>(src); break;
+    case 8: vec.x8 = *reinterpret_cast<const ulonglong4*>(src); break;
+    default: break;
+  }
+}
+
+// Loads vecLen consecutive 16-bit values starting at x[i] into the matching member of vec.
+// Widths other than 1, 2, 4 and 8 leave vec untouched.
+template <int vecLen>
+__device__ inline void load_u16_vector(struct u16_vector& vec, const uint16_t* x, int i)
+{
+  const uint16_t* src = x + i;
+  switch (vecLen) {
+    case 1: vec.x1 = *reinterpret_cast<const ushort1*>(src); break;
+    case 2: vec.x2 = *reinterpret_cast<const ushort2*>(src); break;
+    case 4: vec.x4 = *reinterpret_cast<const ushort4*>(src); break;
+    case 8: vec.x8 = *reinterpret_cast<const uint4*>(src); break;
+    default: break;
+  }
+}
+
+// Returns element i of the vecLen-wide member of vec after passing it through convert().
+//
+// vecLen picks the member that is read (x1, x2, x4 or x8); for any other vecLen nothing is
+// assigned to the result. An index i past the last element yields the last element.
+// NOTE(review): callers in this file pass 0 <= i < vecLen inside unrolled loops, so the
+// switches presumably fold to a single member access - confirm in generated code.
+template <int vecLen>
+__device__ inline uint32_t get_element_from_u32_vector(struct u32_vector& vec, int i)
+{
+  uint32_t xi;
+  if (vecLen == 1) {
+    xi = convert(vec.x1.x);
+  } else if (vecLen == 2) {
+    if (i == 0) {
+      xi = convert(vec.x2.x);
+    } else {
+      xi = convert(vec.x2.y);
+    }
+  } else if (vecLen == 4) {
+    // Any i outside 0..2 selects the last lane (w).
+    switch (i) {
+      case 0: xi = convert(vec.x4.x); break;
+      case 1: xi = convert(vec.x4.y); break;
+      case 2: xi = convert(vec.x4.z); break;
+      default: xi = convert(vec.x4.w); break;
+    }
+  } else if (vecLen == 8) {
+    // Each 64-bit lane of x8 holds two elements: even i is the low half, odd i the high half.
+    switch (i) {
+      case 0: xi = convert((uint32_t)(vec.x8.x & 0xffffffff)); break;
+      case 1: xi = convert((uint32_t)(vec.x8.x >> 32)); break;
+      case 2: xi = convert((uint32_t)(vec.x8.y & 0xffffffff)); break;
+      case 3: xi = convert((uint32_t)(vec.x8.y >> 32)); break;
+      case 4: xi = convert((uint32_t)(vec.x8.z & 0xffffffff)); break;
+      case 5: xi = convert((uint32_t)(vec.x8.z >> 32)); break;
+      case 6: xi = convert((uint32_t)(vec.x8.w & 0xffffffff)); break;
+      default: xi = convert((uint32_t)(vec.x8.w >> 32)); break;
+    }
+  }
+  return xi;
+}
+
+// Returns element i of the vecLen-wide member of vec after passing it through convert().
+//
+// vecLen picks the member that is read (x1, x2, x4 or x8); for any other vecLen nothing is
+// assigned to the result. An index i past the last element yields the last element.
+// NOTE(review): callers in this file pass 0 <= i < vecLen inside unrolled loops, so the
+// switches presumably fold to a single member access - confirm in generated code.
+template <int vecLen>
+__device__ inline uint16_t get_element_from_u16_vector(struct u16_vector& vec, int i)
+{
+  uint16_t xi;
+  if (vecLen == 1) {
+    xi = convert(vec.x1.x);
+  } else if (vecLen == 2) {
+    if (i == 0) {
+      xi = convert(vec.x2.x);
+    } else {
+      xi = convert(vec.x2.y);
+    }
+  } else if (vecLen == 4) {
+    // Any i outside 0..2 selects the last lane (w).
+    switch (i) {
+      case 0: xi = convert(vec.x4.x); break;
+      case 1: xi = convert(vec.x4.y); break;
+      case 2: xi = convert(vec.x4.z); break;
+      default: xi = convert(vec.x4.w); break;
+    }
+  } else if (vecLen == 8) {
+    // Each 32-bit lane of x8 holds two elements: even i is the low half, odd i the high half.
+    switch (i) {
+      case 0: xi = convert((uint16_t)(vec.x8.x & 0xffff)); break;
+      case 1: xi = convert((uint16_t)(vec.x8.x >> 16)); break;
+      case 2: xi = convert((uint16_t)(vec.x8.y & 0xffff)); break;
+      case 3: xi = convert((uint16_t)(vec.x8.y >> 16)); break;
+      case 4: xi = convert((uint16_t)(vec.x8.z & 0xffff)); break;
+      case 5: xi = convert((uint16_t)(vec.x8.z >> 16)); break;
+      case 6: xi = convert((uint16_t)(vec.x8.w & 0xffff)); break;
+      default: xi = convert((uint16_t)(vec.x8.w >> 16)); break;
+    }
+  }
+  return xi;
+}
+
+// Builds the histogram for pass itr of the threshold search and records settled elements in state.
+template <typename T, int blockDim_x, int stateBitLen, int vecLen>
+__device__ inline void update_histogram(int itr,
+                                        uint32_t thread_id,
+                                        uint32_t num_threads,
+                                        uint32_t hint,
+                                        uint32_t threshold,
+                                        uint32_t& num_bins,
+                                        uint32_t& shift,
+                                        const T* x,  // [nx,]
+                                        uint32_t nx,
+                                        uint32_t* hist,  // [num_bins]
+                                        uint8_t* state,
+                                        uint32_t* output,  // [topk]
+                                        uint32_t* output_count)
+{
+  if (sizeof(T) == 4) {
+    // 32-bit (uint32_t)
+    // itr:0, calculate histogram with 11 bits from bit-21 to bit-31
+    // itr:1, calculate histogram with 11 bits from bit-10 to bit-20
+    // itr:2, calculate histogram with 10 bits from bit-0 to bit-9
+    if (itr == 0) {
+      shift    = 21;
+      num_bins = 2048;
+    } else if (itr == 1) {
+      shift    = 10;
+      num_bins = 2048;
+    } else {
+      shift    = 0;
+      num_bins = 1024;
+    }
+  } else if (sizeof(T) == 2) {
+    // 16-bit (uint16_t)
+    // itr:0, calculate histogram with 8 bits from bit-8 to bit-15
+    // itr:1, calculate histogram with 8 bits from bit-0 to bit-7
+    if (itr == 0) {
+      shift    = 8;
+      num_bins = 256;
+    } else {
+      shift    = 0;
+      num_bins = 256;
+    }
+  } else {
+    return;  // element sizes other than 2 and 4 bytes are not handled
+  }
+  if (itr > 0) {  // hist is not cleared here on pass 0; topk_cta_11_core zeroes it beforehand
+    for (int i = threadIdx.x; i < num_bins; i += blockDim_x) {
+      hist[i] = 0;
+    }
+    __syncthreads();
+  }
+
+  // (*) Note that 'thread_id' may be different from 'threadIdx.x',
+  // and 'num_threads' may be different from 'blockDim.x'
+  int ii = 0;
+  for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) {
+    uint8_t iState = 0;
+    if ((stateBitLen == 8) && (itr > 0)) {
+      iState = state[thread_id + (num_threads * ii)];
+      if (iState == (uint8_t)0xff) continue;  // every element tracked by this byte is settled
+    }
+#pragma unroll
+    for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) {
+      int iv = i + (num_threads * v);
+      if (iv >= nx) break;
+
+      struct u32_vector x_u32_vec;
+      struct u16_vector x_u16_vec;
+      if (sizeof(T) == 4) {
+        load_u32_vector<vecLen>(x_u32_vec, (const uint32_t*)x, iv);
+      } else {
+        load_u16_vector<vecLen>(x_u16_vec, (const uint16_t*)x, iv);
+      }
+#pragma unroll
+      for (int u = 0; u < vecLen; u++) {
+        int ivu = iv + u;
+        if (ivu >= nx) break;
+
+        uint8_t mask = (uint8_t)0x1 << (v + u);  // state bit owned by this element
+        if ((stateBitLen == 8) && (iState & mask)) continue;
+
+        uint32_t xi;
+        if (sizeof(T) == 4) {
+          xi = get_element_from_u32_vector<vecLen>(x_u32_vec, u);
+        } else {
+          xi = get_element_from_u16_vector<vecLen>(x_u16_vec, u);
+        }
+        if ((xi > hint) && (itr == 0)) {  // above the caller-supplied bound: skip in later passes
+          if (stateBitLen == 8) { iState |= mask; }
+        } else if (xi < threshold) {
+          if (stateBitLen == 8) {
+            // If the condition is already met, record the index.
+            output[atomicAdd(output_count, 1)] = ivu;
+            iState |= mask;
+          }
+        } else {
+          uint32_t k = (xi - threshold) >> shift;  // 0 <= k
+          if (k >= num_bins) {
+            if (stateBitLen == 8) { iState |= mask; }
+          } else if (k + 1 < num_bins) {  // bin k is counted at hist[k + 1]; the last bin is not
+            // Update histogram
+            atomicAdd(&(hist[k + 1]), 1);
+          }
+        }
+      }
+    }
+    if (stateBitLen == 8) { state[thread_id + (num_threads * ii)] = iState; }
+  }
+  __syncthreads();
+}
+
+// Scans hist and publishes, through best_index/best_csum, the bin chosen for the next threshold.
+template <int blockDim_x>
+__device__ inline void select_best_index_for_next_threshold(uint32_t topk,
+                                                            uint32_t threshold,
+                                                            uint32_t max_threshold,
+                                                            uint32_t nx_below_threshold,
+                                                            uint32_t num_bins,
+                                                            uint32_t shift,
+                                                            const uint32_t* hist,  // [num_bins]
+                                                            uint32_t* best_index,
+                                                            uint32_t* best_csum)
+{
+  // Scan the histogram ('hist') and compute csum. Then, find the largest
+  // index under the condition that the sum of the number of elements found
+  // so far ('nx_below_threshold') and the csum value does not exceed the
+  // topk value.
+  typedef BlockScan<uint32_t, blockDim_x> BlockScanT;
+  __shared__ typename BlockScanT::TempStorage temp_storage;
+
+  uint32_t my_index = 0xffffffff;  // sentinel: this thread found no admissible bin
+  uint32_t my_csum  = 0;
+  if (num_bins <= blockDim_x) {
+    uint32_t csum = 0;
+    if (threadIdx.x < num_bins) { csum = hist[threadIdx.x]; }
+    BlockScanT(temp_storage).InclusiveSum(csum, csum);
+    if (threadIdx.x < num_bins) {
+      uint32_t index = threadIdx.x;
+      if ((nx_below_threshold + csum <= topk) && (threshold + (index << shift) <= max_threshold)) {
+        my_index = index;
+        my_csum  = csum;
+      }
+    }
+  } else {
+    if (num_bins == 2048) {
+      constexpr int n_data = 2048 / blockDim_x;  // consecutive bins handled by each thread
+      uint32_t csum[n_data];
+      for (int i = 0; i < n_data; i++) {
+        csum[i] = hist[i + (n_data * threadIdx.x)];
+      }
+      BlockScanT(temp_storage).InclusiveSum(csum, csum);
+      for (int i = n_data - 1; i >= 0; i--) {  // highest bin first, stop at the first that fits
+        if (nx_below_threshold + csum[i] > topk) continue;
+        uint32_t index = i + (n_data * threadIdx.x);
+        if (threshold + (index << shift) > max_threshold) continue;
+        my_index = index;
+        my_csum  = csum[i];
+        break;
+      }
+    } else if (num_bins == 1024) {
+      constexpr int n_data = 1024 / blockDim_x;  // consecutive bins handled by each thread
+      uint32_t csum[n_data];
+      for (int i = 0; i < n_data; i++) {
+        csum[i] = hist[i + (n_data * threadIdx.x)];
+      }
+      BlockScanT(temp_storage).InclusiveSum(csum, csum);
+      for (int i = n_data - 1; i >= 0; i--) {  // highest bin first, stop at the first that fits
+        if (nx_below_threshold + csum[i] > topk) continue;
+        uint32_t index = i + (n_data * threadIdx.x);
+        if (threshold + (index << shift) > max_threshold) continue;
+        my_index = index;
+        my_csum  = csum[i];
+        break;
+      }
+    }
+  }
+  if (threadIdx.x < num_bins) {
+    // Highest lane of this warp that holds a candidate; -1 when the ballot is empty.
+    int laneid = 31 - __clz(__ballot_sync(0xffffffff, (my_index != 0xffffffff)));
+    if ((threadIdx.x & 0x1f) == laneid) {
+      uint32_t old_index = atomicMax(best_index, my_index);
+      if (old_index < my_index) { atomicMax(best_csum, my_csum); }
+    }
+  }
+  __syncthreads();
+}
+
+// Appends to output the indices of elements below threshold, plus ties while room remains in topk.
+template <typename T, int stateBitLen, int vecLen>
+__device__ inline void output_index_below_threshold(uint32_t topk,
+                                                    uint32_t thread_id,
+                                                    uint32_t num_threads,
+                                                    uint32_t threshold,
+                                                    uint32_t nx_below_threshold,
+                                                    const T* x,  // [nx,]
+                                                    uint32_t nx,
+                                                    const uint8_t* state,
+                                                    uint32_t* output,  // [topk]
+                                                    uint32_t* output_count,
+                                                    uint32_t* output_count_eq)
+{
+  int ii = 0;
+  for (int i = thread_id * vecLen; i < nx; i += num_threads * max(vecLen, stateBitLen), ii++) {
+    uint8_t iState = 0;
+    if (stateBitLen == 8) {
+      iState = state[thread_id + (num_threads * ii)];
+      if (iState == (uint8_t)0xff) continue;  // every element tracked by this byte is settled
+    }
+#pragma unroll
+    for (int v = 0; v < max(vecLen, stateBitLen); v += vecLen) {
+      int iv = i + (num_threads * v);
+      if (iv >= nx) break;
+
+      struct u32_vector u32_vec;
+      struct u16_vector u16_vec;
+      if (sizeof(T) == 4) {
+        load_u32_vector<vecLen>(u32_vec, (const uint32_t*)x, iv);
+      } else {
+        load_u16_vector<vecLen>(u16_vec, (const uint16_t*)x, iv);
+      }
+#pragma unroll
+      for (int u = 0; u < vecLen; u++) {
+        int ivu = iv + u;
+        if (ivu >= nx) break;
+
+        uint8_t mask = (uint8_t)0x1 << (v + u);  // state bit owned by this element
+        if ((stateBitLen == 8) && (iState & mask)) continue;
+
+        uint32_t xi;
+        if (sizeof(T) == 4) {
+          xi = get_element_from_u32_vector<vecLen>(u32_vec, u);
+        } else {
+          xi = get_element_from_u16_vector<vecLen>(u16_vec, u);
+        }
+        if (xi < threshold) {
+          output[atomicAdd(output_count, 1)] = ivu;
+        } else if (xi == threshold) {
+          // (*) If the value is equal to the threshold, the index
+          // processed first is recorded. Cause of non-determinism.
+          if (nx_below_threshold + atomicAdd(output_count_eq, 1) < topk) {
+            output[atomicAdd(output_count, 1)] = ivu;
+          }
+        }
+      }
+    }
+  }
+}
+
+// Exchanges the contents of val1 and val2 through a temporary copy.
+template <typename T>
+__device__ inline void swap(T& val1, T& val2)
+{
+  const T saved = val2;
+  val2          = val1;
+  val1          = saved;
+}
+
+// Puts the pair in ascending order: swaps the keys when key1 > key2.
+// Returns true when a swap happened.
+template <typename K>
+__device__ inline bool swap_if_needed(K& key1, K& key2)
+{
+  // A failed comparison (including unordered operands such as NaN) leaves the pair alone.
+  if (!(key1 > key2)) { return false; }
+  swap<K>(key1, key2);
+  return true;
+}
+
+// Key/value variant: when key1 > key2, swaps both the keys and their associated values.
+// Returns true when a swap happened.
+template <typename K, typename V>
+__device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2)
+{
+  // A failed comparison (including unordered operands such as NaN) leaves the pair alone.
+  if (!(key1 > key2)) { return false; }
+  swap<K>(key1, key2);
+  swap<V>(val1, val2);
+  return true;
+}
+
+// Direction-aware compare-exchange on a key/value pair; returns true when a swap happened.
+// Equal keys are never swapped; otherwise the pair is swapped when (key1 > key2) == ascending.
+template <typename K, typename V>
+__device__ inline bool swap_if_needed(K& key1, K& key2, V& val1, V& val2, bool ascending)
+{
+  const bool must_swap = !(key1 == key2) && ((key1 > key2) == ascending);
+  if (must_swap) {
+    swap<K>(key1, key2);
+    swap<V>(val1, val2);
+  }
+  return must_swap;
+}
+
+// Largest finite value representable in T.
+// The primary template is declared but has no body; only the float and uint32_t
+// specialisations below provide a definition.
+template <typename T>
+__device__ inline T max_value_of();
+
+// float specialisation.
+template <>
+__device__ inline float max_value_of<float>() { return FLT_MAX; }
+
+// uint32_t specialisation: all 32 bits set.
+template <>
+__device__ inline uint32_t max_value_of<uint32_t>() { return 0xffffffffu; }
+// Number of state entries for len_x elements; 0 unless stateBitLen is 8.
+template <int blockDim_x, int stateBitLen>
+__device__ __host__ inline uint32_t get_state_size(uint32_t len_x)
+{
+  if (stateBitLen != 8) { return 0; }
+  // Ceil-divide the elements over the threads, then ceil-divide each thread's share into
+  // groups of stateBitLen elements; every thread gets one entry per group.
+  const uint32_t threads_per_block = blockDim_x;
+  const uint32_t elems_per_thread  = (len_x + threads_per_block - 1) / threads_per_block;
+  const uint32_t states_per_thread = (elems_per_thread + stateBitLen - 1) / stateBitLen;
+  return states_per_thread * threads_per_block;
+}
+
+// Per-block top-k: finds a threshold over x, gathers the indices below it, optionally sorts them.
+template <int blockDim_x, int stateBitLen, int vecLen, int maxTopk, int numSortThreads>
+__device__ inline void topk_cta_11_core(uint32_t topk,
+                                        uint32_t len_x,
+                                        const uint32_t* _x,        // [size_batch, ld_x,]
+                                        const uint32_t* _in_vals,  // [size_batch, ld_iv,]
+                                        uint32_t* _y,              // [size_batch, ld_y,]
+                                        uint32_t* _out_vals,       // [size_batch, ld_ov,]
+                                        uint8_t* _state,           // [size_batch, ...,]
+                                        uint32_t* _hint,
+                                        bool sort,
+                                        uint32_t* _smem)
+{
+  uint32_t* smem_out_vals = _smem;  // selected indices; this region is reused by the sort below
+  uint32_t* hist          = &(_smem[2 * maxTopk]);  // 2048 words, zeroed below
+  uint32_t* best_index    = &(_smem[2 * maxTopk + 2048]);  // 3 words, one per pass j
+  uint32_t* best_csum     = &(_smem[2 * maxTopk + 2048 + 3]);  // 3 words, one per pass j
+
+  const uint32_t num_threads = blockDim_x;
+  const uint32_t thread_id   = threadIdx.x;
+  uint32_t nx                = len_x;
+  const uint32_t* x          = _x;
+  const uint32_t* in_vals    = NULL;
+  if (_in_vals) { in_vals = _in_vals; }
+  uint32_t* y = NULL;
+  if (_y) { y = _y; }
+  uint32_t* out_vals = NULL;
+  if (_out_vals) { out_vals = _out_vals; }
+  uint8_t* state = _state;
+  uint32_t hint  = (_hint == NULL ? ~0u : *_hint);  // no hint: every value passes the bound
+
+  // Initialize shared memory
+  for (int i = 2 * maxTopk + thread_id; i < 2 * maxTopk + 2048 + 8; i += num_threads) {
+    _smem[i] = 0;
+  }
+  uint32_t* output_count      = &(_smem[2 * maxTopk + 2048 + 6]);
+  uint32_t* output_count_eq   = &(_smem[2 * maxTopk + 2048 + 7]);
+  uint32_t threshold          = 0;
+  uint32_t nx_below_threshold = 0;
+  __syncthreads();
+
+  //
+  // Search for the maximum threshold that satisfies "(x < threshold).sum() <= topk".
+  //
+#pragma unroll
+  for (int j = 0; j < 3; j += 1) {
+    uint32_t num_bins;
+    uint32_t shift;
+    update_histogram<uint32_t, blockDim_x, stateBitLen, vecLen>(j,
+                                                                thread_id,
+                                                                num_threads,
+                                                                hint,
+                                                                threshold,
+                                                                num_bins,
+                                                                shift,
+                                                                x,
+                                                                nx,
+                                                                hist,
+                                                                state,
+                                                                smem_out_vals,
+                                                                output_count);
+
+    select_best_index_for_next_threshold<blockDim_x>(topk,
+                                                     threshold,
+                                                     hint,
+                                                     nx_below_threshold,
+                                                     num_bins,
+                                                     shift,
+                                                     hist,
+                                                     best_index + j,
+                                                     best_csum + j);
+
+    threshold += (best_index[j] << shift);
+    nx_below_threshold += best_csum[j];
+    if (nx_below_threshold == topk) break;  // exactly topk elements already lie below threshold
+  }
+
+  if ((_hint != NULL) && (thread_id == 0)) { *_hint = min(threshold, hint); }
+
+  //
+  // Output index that satisfies "x[i] < threshold".
+  //
+  output_index_below_threshold<uint32_t, stateBitLen, vecLen>(topk,
+                                                              thread_id,
+                                                              num_threads,
+                                                              threshold,
+                                                              nx_below_threshold,
+                                                              x,
+                                                              nx,
+                                                              state,
+                                                              smem_out_vals,
+                                                              output_count,
+                                                              output_count_eq);
+  __syncthreads();
+
+#ifdef CUANN_DEBUG  // NOTE(review): uses i_batch, which this function does not declare
+  if (thread_id == 0 && output_count[0] < topk) {
+    RAFT_LOG_DEBUG(
+      "# i_batch:%d, topk:%d, output_count:%d, nx_below_threshold:%d, threshold:%08x\n",
+      i_batch,
+      topk,
+      output_count[0],
+      nx_below_threshold,
+      threshold);
+  }
+#endif
+
+  if (!sort) {  // unsorted output: copy the selected entries in gather order and finish
+    for (int k = thread_id; k < topk; k += blockDim_x) {
+      uint32_t i = smem_out_vals[k];
+      if (y) { y[k] = x[i]; }
+      if (out_vals) {
+        if (in_vals) {
+          out_vals[k] = in_vals[i];
+        } else {
+          out_vals[k] = i;
+        }
+      }
+    }
+    return;
+  }
+
+  constexpr int numTopkPerThread = maxTopk / numSortThreads;
+  float my_keys[numTopkPerThread];
+  uint32_t my_vals[numTopkPerThread];
+
+  // Read keys and values to registers
+  if (thread_id < numSortThreads) {
+    for (int i = 0; i < numTopkPerThread; i++) {
+      int k = thread_id + (numSortThreads * i);
+      if (k < topk) {
+        int j      = smem_out_vals[k];
+        my_keys[i] = ((float*)x)[j];  // keys are read back as float for the sort
+        if (in_vals) {
+          my_vals[i] = in_vals[j];
+        } else {
+          my_vals[i] = j;
+        }
+      } else {
+        my_keys[i] = FLT_MAX;  // padding entry for slots at or beyond topk
+        my_vals[i] = 0xffffffffU;
+      }
+    }
+  }
+
+  uint32_t mask = 1;
+
+  // Sorting by thread
+  if (thread_id < numSortThreads) {
+    bool ascending = ((thread_id & mask) == 0);
+    if (numTopkPerThread == 3) {
+      swap_if_needed<float, uint32_t>(my_keys[0], my_keys[1], my_vals[0], my_vals[1], ascending);
+      swap_if_needed<float, uint32_t>(my_keys[0], my_keys[2], my_vals[0], my_vals[2], ascending);
+      swap_if_needed<float, uint32_t>(my_keys[1], my_keys[2], my_vals[1], my_vals[2], ascending);
+    } else {
+      for (int j = 0; j < numTopkPerThread / 2; j += 1) {
+#pragma unroll
+        for (int i = 0; i < numTopkPerThread; i += 2) {
+          swap_if_needed<float, uint32_t>(
+            my_keys[i], my_keys[i + 1], my_vals[i], my_vals[i + 1], ascending);
+        }
+#pragma unroll
+        for (int i = 1; i < numTopkPerThread - 1; i += 2) {
+          swap_if_needed<float, uint32_t>(
+            my_keys[i], my_keys[i + 1], my_vals[i], my_vals[i + 1], ascending);
+        }
+      }
+    }
+  }
+
+  // Bitonic Sorting
+  while (mask < numSortThreads) {
+    uint32_t next_mask = mask << 1;
+
+    for (uint32_t curr_mask = mask; curr_mask > 0; curr_mask >>= 1) {
+      bool ascending = ((thread_id & curr_mask) == 0) == ((thread_id & next_mask) == 0);
+      if (curr_mask >= 32) {
+        // inter warp
+        uint32_t* smem_vals = _smem;  // [numTopkPerThread, numSortThreads]
+        float* smem_keys    = (float*)(_smem + numTopkPerThread * numSortThreads);
+        __syncthreads();
+        if (thread_id < numSortThreads) {
+#pragma unroll
+          for (int i = 0; i < numTopkPerThread; i++) {
+            smem_keys[thread_id + (numSortThreads * i)] = my_keys[i];
+            smem_vals[thread_id + (numSortThreads * i)] = my_vals[i];
+          }
+        }
+        __syncthreads();
+        if (thread_id < numSortThreads) {
+#pragma unroll
+          for (int i = 0; i < numTopkPerThread; i++) {
+            float opp_key    = smem_keys[(thread_id ^ curr_mask) + (numSortThreads * i)];
+            uint32_t opp_val = smem_vals[(thread_id ^ curr_mask) + (numSortThreads * i)];
+            swap_if_needed<float, uint32_t>(my_keys[i], opp_key, my_vals[i], opp_val, ascending);
+          }
+        }
+      } else {
+        // intra warp
+        // NOTE(review): the full-warp mask assumes whole warps take this branch - confirm.
+        if (thread_id < numSortThreads) {
+#pragma unroll
+          for (int i = 0; i < numTopkPerThread; i++) {
+            float opp_key    = __shfl_xor_sync(0xffffffff, my_keys[i], curr_mask);
+            uint32_t opp_val = __shfl_xor_sync(0xffffffff, my_vals[i], curr_mask);
+            swap_if_needed<float, uint32_t>(my_keys[i], opp_key, my_vals[i], opp_val, ascending);
+          }
+        }
+      }
+    }
+
+    if (thread_id < numSortThreads) {
+      bool ascending = ((thread_id & next_mask) == 0);
+      if (numTopkPerThread == 3) {
+        swap_if_needed<float, uint32_t>(my_keys[0], my_keys[1], my_vals[0], my_vals[1], ascending);
+        swap_if_needed<float, uint32_t>(my_keys[0], my_keys[2], my_vals[0], my_vals[2], ascending);
+        swap_if_needed<float, uint32_t>(my_keys[1], my_keys[2], my_vals[1], my_vals[2], ascending);
+      } else {
+#pragma unroll
+        for (uint32_t curr_mask = numTopkPerThread / 2; curr_mask > 0; curr_mask >>= 1) {
+#pragma unroll
+          for (int i = 0; i < numTopkPerThread; i++) {
+            int j = i ^ curr_mask;
+            if (i > j) continue;
+            swap_if_needed<float, uint32_t>(
+              my_keys[i], my_keys[j], my_vals[i], my_vals[j], ascending);
+          }
+        }
+      }
+    }
+    mask = next_mask;
+  }
+
+  // Write sorted keys and values
+  if (thread_id < numSortThreads) {
+    for (int i = 0; i < numTopkPerThread; i++) {
+      int k = i + (numTopkPerThread * thread_id);
+      if (k < topk) {
+        if (y) { y[k] = ((uint32_t*)my_keys)[i]; }
+        if (out_vals) { out_vals[k] = my_vals[i]; }
+      }
+    }
+  }
+}
+
+namespace {
+
+// Compile-time configuration shared by the top-k host wrappers in this file.
+constexpr std::uint32_t NUM_THREADS      = 1024;  // DO NOT CHANGE
+constexpr std::uint32_t STATE_BIT_LENGTH = 8;     // 0: state not used,  8: state used
+constexpr std::uint32_t MAX_VEC_LENGTH   = 4;     // 1, 2, 4 or 8
+
+// Starts from min(maxVecLen, MAX_VEC_LENGTH) and halves the candidate until it divides
+// maxSamples evenly. maxVecLen must be >= 1: a candidate of 0 would be used as a divisor.
+int _get_vecLen(uint32_t maxSamples, int maxVecLen = MAX_VEC_LENGTH)
+{
+  int vecLen = min(maxVecLen, (int)MAX_VEC_LENGTH);
+  while ((maxSamples % vecLen) != 0) {
+    vecLen /= 2;
+  }
+  return vecLen;
+}
+}  // unnamed namespace
+
+template <int blockDim_x, int stateBitLen, int vecLen, int maxTopk, int numSortThreads>
+__launch_bounds__(1024, 1) __global__  // batched entry point around topk_cta_11_core
+  void kern_topk_cta_11(uint32_t topk,
+                        uint32_t size_batch,
+                        uint32_t len_x,
+                        const uint32_t* _x,  // [size_batch, ld_x,]
+                        uint32_t ld_x,
+                        const uint32_t* _in_vals,  // [size_batch, ld_iv,]
+                        uint32_t ld_iv,
+                        uint32_t* _y,  // [size_batch, ld_y,]
+                        uint32_t ld_y,
+                        uint32_t* _out_vals,  // [size_batch, ld_ov,]
+                        uint32_t ld_ov,
+                        uint8_t* _state,   // [size_batch, ...,]
+                        uint32_t* _hints,  // [size_batch,]
+                        bool sort)
+{
+  uint32_t i_batch = blockIdx.x;  // each thread block works on one batch entry
+  if (i_batch >= size_batch) return;
+  __shared__ uint32_t _smem[2 * maxTopk + 2048 + 8];  // scratch handed to the core routine
+
+  topk_cta_11_core<blockDim_x, stateBitLen, vecLen, maxTopk, numSortThreads>(
+    topk,
+    len_x,
+    (_x == NULL ? NULL : _x + i_batch * ld_x),  // NULL buffers stay NULL, others are offset
+    (_in_vals == NULL ? NULL : _in_vals + i_batch * ld_iv),
+    (_y == NULL ? NULL : _y + i_batch * ld_y),
+    (_out_vals == NULL ? NULL : _out_vals + i_batch * ld_ov),
+    (_state == NULL ? NULL : _state + i_batch * get_state_size<blockDim_x, stateBitLen>(len_x)),
+    (_hints == NULL ? NULL : _hints + i_batch),
+    sort,
+    _smem);
+}
+
+// Workspace size for _cuann_find_topk: aligned state bytes for all batches, or 1 if state is off.
+size_t inline _cuann_find_topk_bufferSize(uint32_t topK,  // not read by this function
+                                          uint32_t sizeBatch,
+                                          uint32_t numElements,
+                                          cudaDataType_t sampleDtype)  // not read by this function
+{
+  constexpr int numThreads  = NUM_THREADS;
+  constexpr int stateBitLen = STATE_BIT_LENGTH;
+  assert(stateBitLen == 0 || stateBitLen == 8);
+
+  size_t workspaceSize = 1;  // returned unchanged when the state buffer is not used
+  // state
+  if (stateBitLen == 8) {
+    workspaceSize = _cuann_aligned(
+      sizeof(uint8_t) * get_state_size<numThreads, stateBitLen>(numElements) * sizeBatch);
+  }
+
+  return workspaceSize;
+}
+
+// Picks a kern_topk_cta_11 instantiation from topK and ldIK, then launches it on `stream` with
+// one NUM_THREADS-wide block per batch entry. Logs an error and exits when topK exceeds 1024.
+inline void _cuann_find_topk(uint32_t topK,
+                             uint32_t sizeBatch,
+                             uint32_t numElements,
+                             const float* inputKeys,     // [sizeBatch, ldIK,]
+                             uint32_t ldIK,              // (*) ldIK >= numElements
+                             const uint32_t* inputVals,  // [sizeBatch, ldIV,]
+                             uint32_t ldIV,              // (*) ldIV >= numElements
+                             float* outputKeys,          // [sizeBatch, ldOK,]
+                             uint32_t ldOK,              // (*) ldOK >= topK
+                             uint32_t* outputVals,       // [sizeBatch, ldOV,]
+                             uint32_t ldOV,              // (*) ldOV >= topK
+                             void* workspace,
+                             bool sort,
+                             uint32_t* hints,
+                             cudaStream_t stream)
+{
+  assert(ldIK >= numElements);
+  assert(ldIV >= numElements);
+  assert(ldOK >= topK);
+  assert(ldOV >= topK);
+
+  constexpr int numThreads  = NUM_THREADS;
+  constexpr int stateBitLen = STATE_BIT_LENGTH;
+  assert(stateBitLen == 0 || stateBitLen == 8);
+
+  uint8_t* state = NULL;  // the workspace is only used as the state buffer
+  if (stateBitLen == 8) { state = (uint8_t*)workspace; }
+
+  dim3 threads(numThreads, 1, 1);
+  dim3 blocks(sizeBatch, 1, 1);
+
+  void (*cta_kernel)(uint32_t,
+                     uint32_t,
+                     uint32_t,
+                     const uint32_t*,
+                     uint32_t,
+                     const uint32_t*,
+                     uint32_t,
+                     uint32_t*,
+                     uint32_t,
+                     uint32_t*,
+                     uint32_t,
+                     uint8_t*,
+                     uint32_t*,
+                     bool) = nullptr;
+
+  // V:vecLen, K:maxTopk, T:numSortThreads
+#define SET_KERNEL_VKT(V, K, T)                                      \
+  do {                                                               \
+    assert(numThreads >= T);                                         \
+    assert((K % T) == 0);                                            \
+    assert((K / T) <= 4);                                            \
+    cta_kernel = kern_topk_cta_11<numThreads, stateBitLen, V, K, T>; \
+  } while (0)
+
+  // V: vecLen
+#define SET_KERNEL_V(V)                                                                      \
+  do {                                                                                       \
+    if (topK <= 32) {                                                                        \
+      SET_KERNEL_VKT(V, 32, 32);                                                             \
+    } else if (topK <= 64) {                                                                 \
+      SET_KERNEL_VKT(V, 64, 32);                                                             \
+    } else if (topK <= 96) {                                                                 \
+      SET_KERNEL_VKT(V, 96, 32);                                                             \
+    } else if (topK <= 128) {                                                                \
+      SET_KERNEL_VKT(V, 128, 32);                                                            \
+    } else if (topK <= 192) {                                                                \
+      SET_KERNEL_VKT(V, 192, 64);                                                            \
+    } else if (topK <= 256) {                                                                \
+      SET_KERNEL_VKT(V, 256, 64);                                                            \
+    } else if (topK <= 384) {                                                                \
+      SET_KERNEL_VKT(V, 384, 128);                                                           \
+    } else if (topK <= 512) {                                                                \
+      SET_KERNEL_VKT(V, 512, 128);                                                           \
+    } else if (topK <= 768) {                                                                \
+      SET_KERNEL_VKT(V, 768, 256);                                                           \
+    } else if (topK <= 1024) {                                                               \
+      SET_KERNEL_VKT(V, 1024, 256);                                                          \
+    } \
+        /* else if (topK <= 1536) { SET_KERNEL_VKT(V, 1536, 512); } */ \
+        /* else if (topK <= 2048) { SET_KERNEL_VKT(V, 2048, 512); } */ \
+        /* else if (topK <= 3072) { SET_KERNEL_VKT(V, 3072, 1024); } */ \
+        /* else if (topK <= 4096) { SET_KERNEL_VKT(V, 4096, 1024); } */ \
+        else {                                                                                      \
+      RAFT_LOG_ERROR(                                                                        \
+        "[ERROR] (%s, %d) topk must be lower than or equal to 1024.\n", __func__, __LINE__); \
+      exit(-1);                                                                              \
+    }                                                                                        \
+  } while (0)
+
+  int _vecLen = _get_vecLen(ldIK, 2);  // 2 when ldIK is even, otherwise 1
+  if (_vecLen == 2) {
+    SET_KERNEL_V(2);
+  } else if (_vecLen == 1) {
+    SET_KERNEL_V(1);
+  }
+
+  cta_kernel<<<blocks, threads, 0, stream>>>(topK,
+                                             sizeBatch,
+                                             numElements,
+                                             (const uint32_t*)inputKeys,  // keys go in as raw 32-bit words
+                                             ldIK,
+                                             inputVals,
+                                             ldIV,
+                                             (uint32_t*)outputKeys,
+                                             ldOK,
+                                             outputVals,
+                                             ldOV,
+                                             state,
+                                             hints,
+                                             sort);
+}
+}  // namespace raft::neighbors::experimental::cagra::detail
\ No newline at end of file
diff --git a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp
new file mode 100644
index 0000000000..3e329c9239
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/cagra/utils.hpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cfloat>
+#include <cstdint>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <raft/core/detail/macros.hpp>
+#include <type_traits>
+
+namespace raft::neighbors::experimental::cagra::detail {
+namespace utils {
+template <class DATA_T>  // maps a C++ element type to the matching cudaDataType_t enumerator
+inline cudaDataType_t get_cuda_data_type();  // declared only; bodies exist for the types below
+template <>
+inline cudaDataType_t get_cuda_data_type<float>()
+{
+  return CUDA_R_32F;
+}
+template <>
+inline cudaDataType_t get_cuda_data_type<half>()
+{
+  return CUDA_R_16F;
+}
+template <>
+inline cudaDataType_t get_cuda_data_type<int8_t>()
+{
+  return CUDA_R_8I;
+}
+template <>
+inline cudaDataType_t get_cuda_data_type<uint8_t>()
+{
+  return CUDA_R_8U;
+}
+template <>
+inline cudaDataType_t get_cuda_data_type<uint32_t>()
+{
+  return CUDA_R_32U;
+}
+template <>
+inline cudaDataType_t get_cuda_data_type<uint64_t>()
+{
+  return CUDA_R_64U;
+}
+
+template <class T>  // constant per-type size lookup; the values below match each type's byte width
+constexpr unsigned size_of();  // declared only; bodies exist for the types specialized below
+template <>
+_RAFT_HOST_DEVICE constexpr unsigned size_of<std::int8_t>()
+{
+  return 1;
+}
+template <>
+_RAFT_HOST_DEVICE constexpr unsigned size_of<std::uint8_t>()
+{
+  return 1;
+}
+template <>
+_RAFT_HOST_DEVICE constexpr unsigned size_of<std::uint16_t>()
+{
+  return 2;
+}
+template <>
+_RAFT_HOST_DEVICE constexpr unsigned size_of<std::uint32_t>()
+{
+  return 4;
+}
+template <>
+_RAFT_HOST_DEVICE constexpr unsigned size_of<std::uint64_t>()
+{
+  return 8;
+}
+template <>
+_RAFT_HOST_DEVICE constexpr unsigned size_of<uint4>()
+{
+  return 16;
+}
+template <>
+_RAFT_HOST_DEVICE constexpr unsigned size_of<ulonglong4>()
+{
+  return 32;
+}
+template <>
+_RAFT_HOST_DEVICE constexpr unsigned size_of<float>()
+{
+  return 4;
+}
+template <>
+_RAFT_HOST_DEVICE constexpr unsigned size_of<half>()
+{
+  return 2;
+}
+
+// Per-type "max value" helpers; fp_conv exposes one storage as a bit pattern (bs) or a float (fp).
+template <class BS_T, class FP_T>
+union fp_conv {
+  BS_T bs;
+  FP_T fp;
+};
+template <class T>
+_RAFT_HOST_DEVICE inline T get_max_value();  // declared only; specialized for float, half, uint32
+template <>
+_RAFT_HOST_DEVICE inline float get_max_value<float>()
+{
+  return FLT_MAX;
+};
+template <>
+_RAFT_HOST_DEVICE inline half get_max_value<half>()
+{
+  return fp_conv<std::uint16_t, half>{.bs = 0x7aff}.fp;  // NOTE(review): largest finite half is 0x7bff; confirm 0x7aff is intended
+};
+template <>
+_RAFT_HOST_DEVICE inline std::uint32_t get_max_value<std::uint32_t>()
+{
+  return 0xffffffffu;
+};
+
+template <int A, int B, class = void>  // compile-time maximum of A and B, exposed as ::value
+struct constexpr_max {  // primary template: used when B <= A
+  static const int value = A;
+};
+// Used when B > A. enable_if_t must yield void here so it matches the primary's default argument.
+template <int A, int B>
+struct constexpr_max<A, B, std::enable_if_t<(B > A)>> {
+  static const int value = B;
+};
+}  // namespace utils
+
+}  // namespace raft::neighbors::experimental::cagra::detail
diff --git a/cpp/include/raft/util/cache_util.cuh b/cpp/include/raft/util/cache_util.cuh
index 4200be96e8..413e7522b1 100644
--- a/cpp/include/raft/util/cache_util.cuh
+++ b/cpp/include/raft/util/cache_util.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,7 +50,7 @@ __global__ void get_vecs(
   if (tid < n_vec * n) {
     size_t out_col   = tid / n_vec;  // col idx
     size_t cache_col = cache_idx[out_col];
-    if (cache_idx[out_col] >= 0) {
+    if (!std::is_signed<idx_t>::value || cache_idx[out_col] >= 0) {
       if (row + out_col * n_vec < (size_t)n_vec * n) { out[tid] = cache[row + cache_col * n_vec]; }
     }
   }
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 91050461ae..e805f53712 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -258,6 +258,7 @@ if(BUILD_TESTS)
     NAME
     NEIGHBORS_TEST
     PATH
+    test/neighbors/ann_cagra/test_float_uint32_t.cu
     test/neighbors/ann_ivf_flat/test_float_int64_t.cu
     test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
     test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh
new file mode 100644
index 0000000000..385e9a80c0
--- /dev/null
+++ b/cpp/test/neighbors/ann_cagra.cuh
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../test_utils.cuh"
+#include "ann_utils.cuh"
+
+#include <raft_internal/neighbors/naive_knn.cuh>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/neighbors/cagra.cuh>
+#include <raft/neighbors/cagra_serialize.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/itertools.hpp>
+
+#include <rmm/device_buffer.hpp>
+
+#include <gtest/gtest.h>
+
+#include <thrust/sequence.h>
+
+#include <cstddef>
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace raft::neighbors::experimental::cagra {
+
+struct AnnCagraInputs {  // one parameter set for AnnCagraTest
+  int n_queries;  // number of query vectors generated in SetUp
+  int n_rows;     // number of dataset vectors generated in SetUp
+  int dim;        // elements per vector
+  int k;          // neighbors requested per query
+  search_algo algo;  // forwarded to search_params.algo
+  int max_queries;   // forwarded to search_params.max_queries
+  int team_size;     // forwarded to search_params.team_size
+  int itopk_size;    // NOTE(review): printed but not forwarded to search_params in testCagra
+  int num_parents;   // NOTE(review): printed but not forwarded to search_params in testCagra
+  raft::distance::DistanceType metric;
+  bool host_dataset;  // build the index from a host copy of the dataset when true
+  // std::optional<double>
+  double min_recall;  // = std::nullopt;
+};
+
+inline ::std::ostream& operator<<(::std::ostream& os, const AnnCagraInputs& p)
+{
+  std::vector<std::string> algo = {"single-cta", "multi_cta", "multi_kernel", "auto"};  // looked up by (int)p.algo
+  os << "{n_queries=" << p.n_queries << ", dataset shape=" << p.n_rows << "x" << p.dim
+     << ", k=" << p.k << ", " << algo.at((int)p.algo) << ", max_queries=" << p.max_queries
+     << ", itopk_size=" << p.itopk_size << ", num_parents=" << p.num_parents
+     << ", metric=" << static_cast<int>(p.metric) << (p.host_dataset ? ", host" : ", device") << '}'
+     << std::endl;  // team_size and min_recall are not part of the printed description
+  return os;
+}
+
+template <typename DistanceT, typename DataT, typename IdxT>
+class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {  // CAGRA vs. naive kNN fixture
+ public:
+  AnnCagraTest()
+    : stream_(handle_.get_stream()),
+      ps(::testing::TestWithParam<AnnCagraInputs>::GetParam()),
+      database(0, stream_),
+      search_queries(0, stream_)
+  {
+  }
+
+ protected:
+  void testCagra()
+  {
+    size_t queries_size = ps.n_queries * ps.k;  // total result entries (n_queries x k)
+    std::vector<IdxT> indices_Cagra(queries_size);
+    std::vector<IdxT> indices_naive(queries_size);
+    std::vector<DistanceT> distances_Cagra(queries_size);
+    std::vector<DistanceT> distances_naive(queries_size);
+
+    {  // reference results: naive kNN on the device, copied back to the host
+      rmm::device_uvector<DistanceT> distances_naive_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_naive_dev(queries_size, stream_);
+      naive_knn<DistanceT, DataT, IdxT>(distances_naive_dev.data(),
+                                        indices_naive_dev.data(),
+                                        search_queries.data(),
+                                        database.data(),
+                                        ps.n_queries,
+                                        ps.n_rows,
+                                        ps.dim,
+                                        ps.k,
+                                        ps.metric,
+                                        stream_);
+      update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_);
+      update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_);
+      handle_.sync_stream(stream_);
+    }
+
+    {  // CAGRA results: build, serialize/deserialize, search, then compare
+      rmm::device_uvector<DistanceT> distances_dev(queries_size, stream_);
+      rmm::device_uvector<IdxT> indices_dev(queries_size, stream_);
+
+      {
+        cagra::index_params index_params;
+        index_params.metric = ps.metric;  // Note (original author): the index_params metric is
+                                          // currently not used for knn_graph building.
+        cagra::search_params search_params;
+        search_params.algo        = ps.algo;
+        search_params.max_queries = ps.max_queries;
+        search_params.team_size   = ps.team_size;  // ps.itopk_size / ps.num_parents are not set here
+
+        auto database_view = raft::make_device_matrix_view<const DataT, IdxT>(
+          (const DataT*)database.data(), ps.n_rows, ps.dim);
+
+        {
+          cagra::index<DataT, IdxT> index(handle_);
+          if (ps.host_dataset) {  // build from a host copy of the device dataset
+            auto database_host = raft::make_host_matrix<DataT, IdxT>(ps.n_rows, ps.dim);
+            raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
+            auto database_host_view = raft::make_host_matrix_view<const DataT, IdxT>(
+              (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim);
+            index = cagra::build<DataT, IdxT>(handle_, index_params, database_host_view);
+          } else {
+            index = cagra::build<DataT, IdxT>(handle_, index_params, database_view);
+          };
+          cagra::serialize(handle_, "cagra_index", index);
+        }
+        auto index = cagra::deserialize<DataT, IdxT>(handle_, "cagra_index");  // search the round-tripped index
+
+        auto search_queries_view = raft::make_device_matrix_view<const DataT, IdxT>(
+          search_queries.data(), ps.n_queries, ps.dim);
+        auto indices_out_view =
+          raft::make_device_matrix_view<IdxT, IdxT>(indices_dev.data(), ps.n_queries, ps.k);
+        auto dists_out_view =
+          raft::make_device_matrix_view<DistanceT, IdxT>(distances_dev.data(), ps.n_queries, ps.k);
+
+        cagra::search(
+          handle_, search_params, index, search_queries_view, indices_out_view, dists_out_view);
+
+        update_host(distances_Cagra.data(), distances_dev.data(), queries_size, stream_);
+        update_host(indices_Cagra.data(), indices_dev.data(), queries_size, stream_);
+        handle_.sync_stream(stream_);
+      }
+      // Debug aid (disabled): print naive (T) and CAGRA (C) indices and distances per query.
+      // for (int i = 0; i < ps.n_queries; i++) {
+      //   print_vector("T", indices_naive.data() + i * ps.k, ps.k, std::cout);
+      //   print_vector("C", indices_Cagra.data() + i * ps.k, ps.k, std::cout);
+      //   print_vector("T", distances_naive.data() + i * ps.k, ps.k, std::cout);
+      //   print_vector("C", distances_Cagra.data() + i * ps.k, ps.k, std::cout);
+      // }
+      double min_recall = ps.min_recall;
+      ASSERT_TRUE(eval_neighbours(indices_naive,
+                                  indices_Cagra,
+                                  distances_naive,
+                                  distances_Cagra,
+                                  ps.n_queries,
+                                  ps.k,
+                                  0.001,
+                                  min_recall));
+      ASSERT_TRUE(eval_distances(handle_,
+                                 database.data(),
+                                 search_queries.data(),
+                                 indices_dev.data(),
+                                 distances_dev.data(),
+                                 ps.n_rows,
+                                 ps.dim,
+                                 ps.n_queries,
+                                 ps.k,
+                                 ps.metric,
+                                 1.0e-4));
+    }
+  }
+
+  void SetUp() override
+  {
+    std::cout << "Resizing database: " << ps.n_rows * ps.dim << std::endl;
+    database.resize(((size_t)ps.n_rows) * ps.dim, stream_);
+    std::cout << "Done.\nResizing queries" << std::endl;
+    search_queries.resize(ps.n_queries * ps.dim, stream_);
+    std::cout << "Done.\nRuning rng" << std::endl;
+    raft::random::Rng r(1234ULL);
+    if constexpr (std::is_same<DataT, float>{}) {  // float data: uniform reals in the given range
+      r.uniform(database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0), stream_);
+      r.uniform(search_queries.data(), ps.n_queries * ps.dim, DataT(0.1), DataT(2.0), stream_);
+    } else {  // any other DataT: uniform integers in the given range
+      r.uniformInt(database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20), stream_);
+      r.uniformInt(search_queries.data(), ps.n_queries * ps.dim, DataT(1), DataT(20), stream_);
+    }
+    handle_.sync_stream(stream_);
+  }
+
+  void TearDown() override
+  {
+    handle_.sync_stream(stream_);
+    database.resize(0, stream_);
+    search_queries.resize(0, stream_);
+  }
+
+ private:
+  raft::device_resources handle_;
+  rmm::cuda_stream_view stream_;  // initialized from handle_, which is declared first
+  AnnCagraInputs ps;
+  rmm::device_uvector<DataT> database;        // n_rows * dim elements after SetUp
+  rmm::device_uvector<DataT> search_queries;  // n_queries * dim elements after SetUp
+};
+
+inline std::vector<AnnCagraInputs> generate_inputs()
+{
+  // Todo(tfeher): MULTI_CTA is not exercised below (possible bug in that mode); consider disabling it.
+  std::vector<AnnCagraInputs> inputs = raft::util::itertools::product<AnnCagraInputs>(
+    {100},   // n_queries
+    {1000},  // n_rows
+    {8},     // dim
+    {1, 16, 33},  // k
+    {search_algo::SINGLE_CTA, search_algo::MULTI_KERNEL},
+    {1, 10, 100},  // max_queries
+    {0},   // team_size
+    {64},  // itopk_size
+    {1},   // num_parents
+    {raft::distance::DistanceType::L2Expanded},
+    {false},   // host_dataset
+    {0.995});  // min_recall
+
+  auto inputs2 =
+    raft::util::itertools::product<AnnCagraInputs>({100},
+                                                   {1000},
+                                                   {2, 4, 8, 64, 128, 196, 256, 512, 1024},  // dim
+                                                   {16},
+                                                   {search_algo::AUTO},
+                                                   {10},
+                                                   {0},
+                                                   {64},
+                                                   {1},
+                                                   {raft::distance::DistanceType::L2Expanded},
+                                                   {false},
+                                                   {0.995});
+  inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
+  inputs2 =
+    raft::util::itertools::product<AnnCagraInputs>({100},
+                                                   {1000},
+                                                   {64},
+                                                   {16},
+                                                   {search_algo::AUTO},
+                                                   {10},
+                                                   {0, 4, 8, 16, 32},  // team_size
+                                                   {64},
+                                                   {1},
+                                                   {raft::distance::DistanceType::L2Expanded},
+                                                   {false},
+                                                   {0.995});
+  inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
+
+  inputs2 =
+    raft::util::itertools::product<AnnCagraInputs>({100},
+                                                   {1000},
+                                                   {64},
+                                                   {16},
+                                                   {search_algo::AUTO},
+                                                   {10},
+                                                   {0},  // team_size
+                                                   {32, 64, 128, 256, 512, 768},  // itopk_size
+                                                   {1},
+                                                   {raft::distance::DistanceType::L2Expanded},
+                                                   {false},
+                                                   {0.995});
+  inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
+
+  inputs2 =
+    raft::util::itertools::product<AnnCagraInputs>({100},
+                                                   {10000, 20000},  // n_rows
+                                                   {30},
+                                                   {10},
+                                                   {search_algo::AUTO},
+                                                   {10},
+                                                   {0},  // team_size
+                                                   {64},
+                                                   {1},
+                                                   {raft::distance::DistanceType::L2Expanded},
+                                                   {false, true},  // host_dataset
+                                                   {0.995});
+  inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
+
+  inputs2 =  // NOTE(review): same arguments as the previous product, so these cases run twice
+    raft::util::itertools::product<AnnCagraInputs>({100},
+                                                   {10000, 20000},
+                                                   {30},
+                                                   {10},
+                                                   {search_algo::AUTO},
+                                                   {10},
+                                                   {0},  // team_size
+                                                   {64},
+                                                   {1},
+                                                   {raft::distance::DistanceType::L2Expanded},
+                                                   {false, true},
+                                                   {0.995});
+  inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
+
+  return inputs;
+}
+
+const std::vector<AnnCagraInputs> inputs = generate_inputs();  // parameter sets built once at static initialization
+
+}  // namespace raft::neighbors::experimental::cagra
\ No newline at end of file
diff --git a/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu
new file mode 100644
index 0000000000..71a83e2cca
--- /dev/null
+++ b/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../ann_cagra.cuh"
+
+// #if defined RAFT_DISTANCE_COMPILED
+// #include <raft/neighbors/specializations.cuh>
+// #endif
+
+namespace raft::neighbors::experimental::cagra {
+
+typedef AnnCagraTest<float, float, std::uint32_t> AnnCagraTestF;  // DistanceT, DataT, IdxT
+TEST_P(AnnCagraTestF, AnnCagra) { this->testCagra(); }
+
+INSTANTIATE_TEST_CASE_P(AnnCagraTest, AnnCagraTestF, ::testing::ValuesIn(inputs));  // one case per entry of `inputs`
+
+}  // namespace raft::neighbors::experimental::cagra
diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh
index 4b07db32f4..fc448f014f 100644
--- a/cpp/test/neighbors/ann_utils.cuh
+++ b/cpp/test/neighbors/ann_utils.cuh
@@ -18,6 +18,7 @@
 
 #include <raft/distance/distance_types.hpp>
 #include <raft/matrix/detail/select_k.cuh>
+#include <raft/matrix/matrix.cuh>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 #include <raft/util/cuda_utils.cuh>
 
@@ -25,8 +26,11 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
+#include <raft_internal/neighbors/naive_knn.cuh>
+
 #include "../test_utils.cuh"
 #include <gtest/gtest.h>
+#include <iostream>
 
 namespace raft::neighbors {
 
@@ -164,4 +168,49 @@ auto eval_neighbours(const std::vector<T>& expected_idx,
   return testing::AssertionSuccess();
 }
 
+template <typename T, typename DistT, typename IdxT>
+auto eval_distances(raft::device_resources const& handle,
+                    const T* x,              // dataset, n_rows * n_cols
+                    const T* queries,        // n_queries * n_cols
+                    const IdxT* neighbors,   // n_queries * k
+                    const DistT* distances,  // n_queries * k
+                    size_t n_rows,
+                    size_t n_cols,
+                    size_t n_queries,
+                    uint32_t k,
+                    raft::distance::DistanceType metric,
+                    double eps) -> testing::AssertionResult
+{
+  // For each query, gather its k reported neighbors from x, recompute their distances with the
+  // naive kernel, and require the reported distances to match within eps.
+  // The scratch buffers have the same shape for every query, so they are allocated once.
+  auto y          = raft::make_device_matrix<T, IdxT>(handle, k, n_cols);
+  auto naive_dist = raft::make_device_matrix<DistT, IdxT>(handle, 1, k);
+  for (size_t i = 0; i < n_queries; i++) {
+    raft::matrix::copyRows<T, IdxT, int64_t>(
+      x, k, n_cols, y.data_handle(), neighbors + i * k, k, handle.get_stream(), true);
+
+    dim3 block_dim(16, 32, 1);
+    auto grid_y =
+      static_cast<uint16_t>(std::min<size_t>(raft::ceildiv<size_t>(k, block_dim.y), 32768));
+    dim3 grid_dim(raft::ceildiv<size_t>(n_rows, block_dim.x), grid_y, 1);  // NOTE(review): x is sized by n_rows though one query row is passed
+
+    naive_distance_kernel<DistT, T, IdxT><<<grid_dim, block_dim, 0, handle.get_stream()>>>(
+      naive_dist.data_handle(), queries + i * n_cols, y.data_handle(), 1, k, n_cols, metric);
+
+    if (!devArrMatch(distances + i * k,
+                     naive_dist.data_handle(),
+                     naive_dist.size(),
+                     CompareApprox<DistT>(eps))) {
+      std::cout << n_rows << "x" << n_cols << ", " << k << std::endl;
+      std::cout << "query " << i << std::endl;
+      print_vector(" indices", neighbors + i * k, k, std::cout);
+      print_vector("n dist", distances + i * k, k, std::cout);
+      print_vector("c dist", naive_dist.data_handle(), naive_dist.size(), std::cout);
+
+      return testing::AssertionFailure();
+    }
+  }
+  return testing::AssertionSuccess();
+}
 }  // namespace raft::neighbors

From 5ff134ac5b32e5ecb17573c5dee3faf3e7390287 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 6 Apr 2023 15:26:55 +0200
Subject: [PATCH 12/89] Add select_k source files

---
 .../matrix/detail/select_k_float_uint32_t.cu  | 33 +++++++++++++++++++
 .../matrix/detail/select_k_float_uint64_t.cu  | 33 +++++++++++++++++++
 .../matrix/detail/select_k_half_uint32_t.cu   | 33 +++++++++++++++++++
 .../matrix/detail/select_k_half_uint64_t.cu   | 33 +++++++++++++++++++
 4 files changed, 132 insertions(+)
 create mode 100644 cpp/src/matrix/detail/select_k_float_uint32_t.cu
 create mode 100644 cpp/src/matrix/detail/select_k_float_uint64_t.cu
 create mode 100644 cpp/src/matrix/detail/select_k_half_uint32_t.cu
 create mode 100644 cpp/src/matrix/detail/select_k_half_uint64_t.cu

diff --git a/cpp/src/matrix/detail/select_k_float_uint32_t.cu b/cpp/src/matrix/detail/select_k_float_uint32_t.cu
new file mode 100644
index 0000000000..3bb47acbf2
--- /dev/null
+++ b/cpp/src/matrix/detail/select_k_float_uint32_t.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                     \
+  template void raft::matrix::detail::select_k(const T* in_val,              \
+                                               const IdxT* in_idx,           \
+                                               size_t batch_size,            \
+                                               size_t len,                   \
+                                               int k,                        \
+                                               T* out_val,                   \
+                                               IdxT* out_idx,                \
+                                               bool select_min,              \
+                                               rmm::cuda_stream_view stream, \
+                                               rmm::mr::device_memory_resource* mr)
+// Explicit instantiation of select_k for T = float, IdxT = uint32_t.
+instantiate_raft_matrix_detail_select_k(float, uint32_t);
+
+#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_float_uint64_t.cu b/cpp/src/matrix/detail/select_k_float_uint64_t.cu
new file mode 100644
index 0000000000..3bb47acbf2
--- /dev/null
+++ b/cpp/src/matrix/detail/select_k_float_uint64_t.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                     \
+  template void raft::matrix::detail::select_k(const T* in_val,              \
+                                               const IdxT* in_idx,           \
+                                               size_t batch_size,            \
+                                               size_t len,                   \
+                                               int k,                        \
+                                               T* out_val,                   \
+                                               IdxT* out_idx,                \
+                                               bool select_min,              \
+                                               rmm::cuda_stream_view stream, \
+                                               rmm::mr::device_memory_resource* mr)
+// Explicit instantiation for T = float, IdxT = uint64_t (the pair named by this file).
+instantiate_raft_matrix_detail_select_k(float, uint64_t);
+
+#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_half_uint32_t.cu b/cpp/src/matrix/detail/select_k_half_uint32_t.cu
new file mode 100644
index 0000000000..b18887bfc0
--- /dev/null
+++ b/cpp/src/matrix/detail/select_k_half_uint32_t.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                     \
+  template void raft::matrix::detail::select_k(const T* in_val,              \
+                                               const IdxT* in_idx,           \
+                                               size_t batch_size,            \
+                                               size_t len,                   \
+                                               int k,                        \
+                                               T* out_val,                   \
+                                               IdxT* out_idx,                \
+                                               bool select_min,              \
+                                               rmm::cuda_stream_view stream, \
+                                               rmm::mr::device_memory_resource* mr)
+// Explicit instantiation of select_k for T = __half, IdxT = uint32_t.
+instantiate_raft_matrix_detail_select_k(__half, uint32_t);
+
+#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_half_uint64_t.cu b/cpp/src/matrix/detail/select_k_half_uint64_t.cu
new file mode 100644
index 0000000000..cf4e15959d
--- /dev/null
+++ b/cpp/src/matrix/detail/select_k_half_uint64_t.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                     \
+  template void raft::matrix::detail::select_k(const T* in_val,              \
+                                               const IdxT* in_idx,           \
+                                               size_t batch_size,            \
+                                               size_t len,                   \
+                                               int k,                        \
+                                               T* out_val,                   \
+                                               IdxT* out_idx,                \
+                                               bool select_min,              \
+                                               rmm::cuda_stream_view stream, \
+                                               rmm::mr::device_memory_resource* mr)
+// NOTE(review): file name says uint64_t but this instantiates int64_t -- confirm which is meant.
+instantiate_raft_matrix_detail_select_k(__half, int64_t);
+
+#undef instantiate_raft_matrix_detail_select_k

From 698ef533c46005cf3083199b0c0cd3db290582d0 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 11 Apr 2023 10:59:08 +0200
Subject: [PATCH 13/89] Split distance.cuh

---
 cpp/CMakeLists.txt                            |   84 +-
 .../raft/cluster/detail/kmeans_common.cuh     |    1 +
 cpp/include/raft/core/resources.hpp           |    3 +-
 .../detail/pairwise_matrix/dispatch-ext.cuh   |  166 +++
 .../detail/pairwise_matrix/dispatch-inl.cuh   |   63 +-
 .../detail/pairwise_matrix/dispatch.cuh       |   40 +
 cpp/include/raft/distance/distance-ext.cuh    | 1224 +++++++++++++++++
 cpp/include/raft/distance/distance-inl.cuh    |  251 ++--
 cpp/include/raft/distance/distance.cuh        |   24 +
 .../detail/00_write_template.py               |  148 --
 .../specializations/detail/canberra.cuh       |   40 -
 .../specializations/detail/correlation.cuh    |   40 -
 .../specializations/detail/cosine.cuh         |   40 -
 .../detail/hamming_unexpanded.cuh             |   40 -
 .../detail/hellinger_expanded.cuh             |   40 -
 .../specializations/detail/inner_product.cuh  |   52 -
 .../specializations/detail/jensen_shannon.cuh |   40 -
 .../specializations/detail/kl_divergence.cuh  |   40 -
 .../distance/specializations/detail/l1.cuh    |   40 -
 .../specializations/detail/l2_expanded.cuh    |   40 -
 .../specializations/detail/l2_unexpanded.cuh  |   40 -
 .../distance/specializations/detail/l_inf.cuh |   40 -
 .../specializations/detail/lp_unexpanded.cuh  |   40 -
 .../specializations/detail/russel_rao.cuh     |   40 -
 .../distance/specializations/distance.cuh     |   15 -
 .../detail/pairwise_matrix/dispatch.cu        |   91 ++
 .../pairwise_matrix/dispatch_00_generate.py   |  151 ++
 ...patch_canberra_double_double_double_int.cu |   46 +
 ...dispatch_canberra_float_float_float_int.cu |   41 +
 ...ch_correlation_double_double_double_int.cu |   46 +
 ...patch_correlation_float_float_float_int.cu |   46 +
 ...ispatch_cosine_double_double_double_int.cu |   42 +
 .../dispatch_cosine_float_float_float_int.cu  |   42 +
 ...ing_unexpanded_double_double_double_int.cu |   41 +
 ...amming_unexpanded_float_float_float_int.cu |   41 +
 ...inger_expanded_double_double_double_int.cu |   46 +
 ...ellinger_expanded_float_float_float_int.cu |   41 +
 ...jensen_shannon_double_double_double_int.cu |   46 +
 ...ch_jensen_shannon_float_float_float_int.cu |   46 +
 ..._kl_divergence_double_double_double_int.cu |   41 +
 ...tch_kl_divergence_float_float_float_int.cu |   41 +
 .../dispatch_l1_double_double_double_int.cu   |   41 +
 .../dispatch_l1_float_float_float_int.cu      |   41 +
 ...ch_l2_expanded_double_double_double_int.cu |   42 +
 ...patch_l2_expanded_float_float_float_int.cu |   42 +
 ..._l2_unexpanded_double_double_double_int.cu |   46 +
 ...tch_l2_unexpanded_float_float_float_int.cu |   41 +
 ...dispatch_l_inf_double_double_double_int.cu |   41 +
 .../dispatch_l_inf_float_float_float_int.cu   |   41 +
 ..._lp_unexpanded_double_double_double_int.cu |   46 +
 ...tch_lp_unexpanded_float_float_float_int.cu |   41 +
 ...tch_russel_rao_double_double_double_int.cu |   46 +
 ...spatch_russel_rao_float_float_float_int.cu |   41 +
 cpp/src/distance/distance.cu                  |  909 ++++++++++++
 cpp/src/distance/fused_l2_min_arg.cu          |    3 +-
 55 files changed, 3932 insertions(+), 928 deletions(-)
 create mode 100644 cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
 create mode 100644 cpp/include/raft/distance/distance-ext.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/00_write_template.py
 delete mode 100644 cpp/include/raft/distance/specializations/detail/canberra.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/correlation.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/cosine.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/inner_product.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/kl_divergence.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/l1.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/l2_expanded.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/l_inf.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/russel_rao.cuh
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
 create mode 100644 cpp/src/distance/distance.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 8aa71647c2..c4051984cc 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -305,20 +305,48 @@ if(RAFT_COMPILE_LIBRARY)
     src/cluster/kmeans_fit_double.cu
     src/cluster/kmeans_init_plus_plus_double.cu
     src/cluster/kmeans_init_plus_plus_float.cu
-    src/distance/specializations/detail/canberra_double_double_double_int.cu
-    src/distance/specializations/detail/canberra_float_float_float_int.cu
-    src/distance/specializations/detail/correlation_double_double_double_int.cu
-    src/distance/specializations/detail/correlation_float_float_float_int.cu
-    src/distance/specializations/detail/cosine_double_double_double_int.cu
-    src/distance/specializations/detail/cosine_float_float_float_int.cu
-    src/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu
-    src/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu
-    src/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu
-    src/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu
-    src/distance/specializations/detail/inner_product_float_float_float_int.cu
-    src/distance/specializations/detail/inner_product_double_double_double_int.cu
-    src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu
-    src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu
+    src/distance/distance.cu
+    src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
+    # src/distance/detail/pairwise_matrix/canberra_float_float_float_int.cu
+    # src/distance/specializations/detail/canberra_double_double_double_int.cu
+    # src/distance/specializations/detail/canberra_float_float_float_int.cu
+    # src/distance/specializations/detail/correlation_double_double_double_int.cu
+    # src/distance/specializations/detail/correlation_float_float_float_int.cu
+    # src/distance/specializations/detail/cosine_double_double_double_int.cu
+    # src/distance/specializations/detail/cosine_float_float_float_int.cu
+    # src/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu
+    # src/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu
+    # src/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu
+    # src/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu
+    # src/distance/specializations/detail/inner_product_float_float_float_int.cu
+    # src/distance/specializations/detail/inner_product_double_double_double_int.cu
+    # src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu
+    # src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu
     src/distance/specializations/detail/kernels/gram_matrix_base_double.cu
     src/distance/specializations/detail/kernels/gram_matrix_base_float.cu
     src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
@@ -329,20 +357,20 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/brute_force_knn_int64_t_float.cu
     src/distance/specializations/detail/kernels/tanh_kernel_double.cu
     src/distance/specializations/detail/kernels/tanh_kernel_float.cu
-    src/distance/specializations/detail/kl_divergence_float_float_float_int.cu
-    src/distance/specializations/detail/kl_divergence_double_double_double_int.cu
-    src/distance/specializations/detail/l1_float_float_float_int.cu
-    src/distance/specializations/detail/l1_double_double_double_int.cu
-    src/distance/specializations/detail/l2_expanded_float_float_float_int.cu
-    src/distance/specializations/detail/l2_expanded_double_double_double_int.cu
-    src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu
-    src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu
-    src/distance/specializations/detail/l_inf_double_double_double_int.cu
-    src/distance/specializations/detail/l_inf_float_float_float_int.cu
-    src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu
-    src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu
-    src/distance/specializations/detail/russel_rao_double_double_double_int.cu
-    src/distance/specializations/detail/russel_rao_float_float_float_int.cu
+    # src/distance/specializations/detail/kl_divergence_float_float_float_int.cu
+    # src/distance/specializations/detail/kl_divergence_double_double_double_int.cu
+    # src/distance/specializations/detail/l1_float_float_float_int.cu
+    # src/distance/specializations/detail/l1_double_double_double_int.cu
+    # src/distance/specializations/detail/l2_expanded_float_float_float_int.cu
+    # src/distance/specializations/detail/l2_expanded_double_double_double_int.cu
+    # src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu
+    # src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu
+    # src/distance/specializations/detail/l_inf_double_double_double_int.cu
+    # src/distance/specializations/detail/l_inf_float_float_float_int.cu
+    # src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu
+    # src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu
+    # src/distance/specializations/detail/russel_rao_double_double_double_int.cu
+    # src/distance/specializations/detail/russel_rao_float_float_float_int.cu
     src/distance/specializations/fused_l2_nn_double_int.cu
     src/distance/specializations/fused_l2_nn_double_int64.cu
     src/distance/specializations/fused_l2_nn_float_int.cu
diff --git a/cpp/include/raft/cluster/detail/kmeans_common.cuh b/cpp/include/raft/cluster/detail/kmeans_common.cuh
index 76fc22e99e..cca1cbb6e9 100644
--- a/cpp/include/raft/cluster/detail/kmeans_common.cuh
+++ b/cpp/include/raft/cluster/detail/kmeans_common.cuh
@@ -38,6 +38,7 @@
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/fused_l2_nn.cuh>
+#include <raft/linalg/norm.cuh>
 #include <raft/linalg/reduce_rows_by_key.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/gather.cuh>
diff --git a/cpp/include/raft/core/resources.hpp b/cpp/include/raft/core/resources.hpp
index 64e281e934..49836ee962 100644
--- a/cpp/include/raft/core/resources.hpp
+++ b/cpp/include/raft/core/resources.hpp
@@ -18,6 +18,7 @@
 #include "resource/resource_types.hpp"
 #include <algorithm>
 #include <mutex>
+#include <raft/core/error.hpp>  // RAFT_EXPECTS
 #include <raft/core/logger.hpp>
 #include <string>
 #include <vector>
@@ -128,4 +129,4 @@ class resources {
   mutable std::vector<pair_res_factory> factories_;
   mutable std::vector<pair_resource> resources_;
 };
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
new file mode 100644
index 0000000000..4fc55c29b8
--- /dev/null
+++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/operators.hpp>                          // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>    // ops::*
+#include <raft/distance/detail/distance_ops/cutlass.cuh>    // ops::has_cutlass_op
+#include <raft/distance/detail/pairwise_matrix/params.cuh>  // pairwise_matrix_params
+#include <raft/util/raft_explicit.hpp>                      // RAFT_EXPLICIT
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE
+
+namespace raft::distance::detail {
+// Declaration of the dispatch entry point; RAFT_EXPLICIT comes from raft/util/raft_explicit.hpp.
+template <typename OpT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename FinOpT,
+          typename IdxT = int>
+void pairwise_matrix_dispatch(OpT distance_op,
+                              IdxT m,
+                              IdxT n,
+                              IdxT k,
+                              const DataT* x,
+                              const DataT* y,
+                              const DataT* x_norm,
+                              const DataT* y_norm,
+                              OutT* out,
+                              FinOpT fin_op,
+                              cudaStream_t stream,
+                              bool is_row_major) RAFT_EXPLICIT;
+
+}  // namespace raft::distance::detail
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  extern template void raft::distance::detail::                                        \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// extern template declarations: each distance op below is listed for float and for double.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::canberra_distance_op, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::canberra_distance_op,
+  double,
+  double,
+  double,
+  raft::identity_op,
+  int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::correlation_distance_op,
+  float,
+  float,
+  float,
+  raft::identity_op,
+  int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::correlation_distance_op,
+  double,
+  double,
+  double,
+  raft::identity_op,
+  int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::cosine_distance_op, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::cosine_distance_op, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::hamming_distance_op, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::hamming_distance_op, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::hellinger_distance_op, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::hellinger_distance_op,
+  double,
+  double,
+  double,
+  raft::identity_op,
+  int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::jensen_shannon_distance_op,
+  float,
+  float,
+  float,
+  raft::identity_op,
+  int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::jensen_shannon_distance_op,
+  double,
+  double,
+  double,
+  raft::identity_op,
+  int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::kl_divergence_op, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::kl_divergence_op, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l1_distance_op, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l1_distance_op, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l2_exp_distance_op, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l2_exp_distance_op, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l2_unexp_distance_op, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l2_unexp_distance_op,
+  double,
+  double,
+  double,
+  raft::identity_op,
+  int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l_inf_distance_op, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l_inf_distance_op, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::lp_unexp_distance_op, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::lp_unexp_distance_op,
+  double,
+  double,
+  double,
+  raft::identity_op,
+  int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::russel_rao_distance_op, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::russel_rao_distance_op,
+  double,
+  double,
+  double,
+  raft::identity_op,
+  int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh
index e04b56ee8a..8df671d637 100644
--- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh
@@ -60,11 +60,35 @@ void pairwise_matrix_sm80_dispatch(OpT,
                                    SM_compat_t,
                                    cudaStream_t);
 
-template <typename OpT, typename IdxT, typename DataT, typename OutT, typename FinOpT>
-void pairwise_matrix_instantiation_point(OpT distance_op,
-                                         pairwise_matrix_params<IdxT, DataT, OutT, FinOpT> params,
-                                         cudaStream_t stream)
+template <typename OpT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename FinOpT,
+          typename IdxT = int>
+void pairwise_matrix_dispatch(OpT distance_op,
+                              IdxT m,
+                              IdxT n,
+                              IdxT k,
+                              const DataT* x,
+                              const DataT* y,
+                              const DataT* x_norm,
+                              const DataT* y_norm,
+                              OutT* out,
+                              FinOpT fin_op,
+                              cudaStream_t stream,
+                              bool is_row_major)
 {
+  // Create kernel parameter struct. Flip x and y if column major.
+  IdxT ldx    = is_row_major ? k : m;
+  IdxT ldy    = is_row_major ? k : n;
+  IdxT ld_out = is_row_major ? n : m;
+
+  pairwise_matrix_params<IdxT, DataT, OutT, FinOpT> params{
+    m, n, k, ldx, ldy, ld_out, x, y, x_norm, y_norm, out, fin_op, is_row_major};
+
+  if (!params.is_row_major) { params.flip_x_and_y(); }
+
   // On CUDA 12:
   // - always execute normal kernel
   //
@@ -103,35 +127,4 @@ void pairwise_matrix_instantiation_point(OpT distance_op,
   }
 }
 
-template <typename OpT,
-          typename DataT,
-          typename AccT,
-          typename OutT,
-          typename FinOpT,
-          typename IdxT = int>
-void pairwise_matrix_dispatch(OpT distance_op,
-                              IdxT m,
-                              IdxT n,
-                              IdxT k,
-                              const DataT* x,
-                              const DataT* y,
-                              const DataT* x_norm,
-                              const DataT* y_norm,
-                              OutT* out,
-                              FinOpT fin_op,
-                              cudaStream_t stream,
-                              bool is_row_major)
-{
-  // Create kernel parameter struct. Flip x and y if column major.
-  IdxT ldx    = is_row_major ? k : m;
-  IdxT ldy    = is_row_major ? k : n;
-  IdxT ld_out = is_row_major ? n : m;
-
-  pairwise_matrix_params<IdxT, DataT, OutT, FinOpT> params{
-    m, n, k, ldx, ldy, ld_out, x, y, x_norm, y_norm, out, fin_op, is_row_major};
-
-  if (!params.is_row_major) { params.flip_x_and_y(); }
-  pairwise_matrix_instantiation_point(distance_op, params, stream);
-}
-
 };  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
index e69de29bb2..73666f639f 100644
--- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+/* This file has two responsibilities:
+ *
+ * 1. Dispatch to the correct implementation of a kernel based on the
+ *    architecture of the device on which the kernel will be launched. For
+ *    instance, the cosine distance has a CUTLASS-based implementation that can
+ *    be used on SM80+ and the normal implementation that is used on older
+ *    architectures.
+ *
+ * 2. Provide concise function templates that can be instantiated in
+ *    src/distance/distance/specializations/detail/. Previously,
+ *    raft::distance::detail::distance was instantiated. The function
+ *    necessarily required a large set of include files, which slowed down the
+ *    build. The raft::distance::detail::pairwise_matrix_dispatch functions
+ *    do not require as large a set of include files, which speeds up the build.
+ */
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "dispatch-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "dispatch-ext.cuh"
+#endif
diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh
new file mode 100644
index 0000000000..20af73e401
--- /dev/null
+++ b/cpp/include/raft/distance/distance-ext.cuh
@@ -0,0 +1,1224 @@
+/*
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>       // raft::device_matrix_view
+#include <raft/core/operators.hpp>           // raft::identity_op
+#include <raft/core/resources.hpp>           // raft::resources
+#include <raft/distance/distance_types.hpp>  // raft::distance::DistanceType
+#include <raft/util/raft_explicit.hpp>       // RAFT_EXPLICIT
+#include <rmm/device_uvector.hpp>            // rmm::device_uvector
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE
+
+namespace raft {
+namespace distance {
+
+/**
+ * @defgroup pairwise_distance pointer-based pairwise distance prims
+ * @{
+ */
+
+/**
+ * @brief Evaluate pairwise distances with the user epilogue lambda allowed
+ * @tparam DistT which distance to evaluate
+ * @tparam DataT input argument type
+ * @tparam AccT accumulation type
+ * @tparam OutT output type
+ * @tparam FinalLambda user-defined epilogue lambda
+ * @tparam IdxT Index type
+ * @param handle raft handle for managing expensive resources
+ * @param x first set of points
+ * @param y second set of points
+ * @param dist output distance matrix
+ * @param m number of points in x
+ * @param n number of points in y
+ * @param k dimensionality
+ * @param workspace temporary workspace needed for computations
+ * @param worksize number of bytes of the workspace
+ * @param fin_op the final gemm epilogue lambda
+ * @param isRowMajor whether the matrices are row-major or col-major
+ * @param metric_arg metric argument (used for Minkowski distance)
+ *
+ * @note fin_op: This is a device lambda which is supposed to operate upon the
+ * input which is AccT and returns the output in OutT. Its signature is
+ * as follows:  <pre>OutT fin_op(AccT in, int g_idx);</pre>. If one needs
+ * any other parameters, feel free to pass them via closure.
+ */
+template <raft::distance::DistanceType DistT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename FinalLambda,
+          typename IdxT = int>
+void distance(raft::resources const& handle,
+              const DataT* x,
+              const DataT* y,
+              OutT* dist,
+              IdxT m,
+              IdxT n,
+              IdxT k,
+              void* workspace,
+              size_t worksize,
+              FinalLambda fin_op,
+              bool isRowMajor  = true,
+              DataT metric_arg = 2.0f) RAFT_EXPLICIT;
+
+/**
+ * @brief Evaluate pairwise distances for the simple use case
+ * @tparam DistT which distance to evaluate
+ * @tparam DataT input argument type
+ * @tparam AccT accumulation type
+ * @tparam OutT output type
+ * @tparam IdxT Index type
+ * @param handle raft handle for managing expensive resources
+ * @param x first set of points
+ * @param y second set of points
+ * @param dist output distance matrix
+ * @param m number of points in x
+ * @param n number of points in y
+ * @param k dimensionality
+ * @param workspace temporary workspace needed for computations
+ * @param worksize number of bytes of the workspace
+ * @param isRowMajor whether the matrices are row-major or col-major
+ * @param metric_arg metric argument (used for Minkowski distance)
+ */
+template <raft::distance::DistanceType DistT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT = int>
+void distance(raft::resources const& handle,
+              const DataT* x,
+              const DataT* y,
+              OutT* dist,
+              IdxT m,
+              IdxT n,
+              IdxT k,
+              void* workspace,
+              size_t worksize,
+              bool isRowMajor  = true,
+              DataT metric_arg = 2.0f) RAFT_EXPLICIT;
+
+/**
+ * @brief Return the exact workspace size to compute the distance
+ * @tparam DistT which distance to evaluate
+ * @tparam DataT input argument type
+ * @tparam AccT accumulation type
+ * @tparam OutT output type
+ * @tparam IdxT Index type
+ * @param x first set of points
+ * @param y second set of points
+ * @param m number of points in x
+ * @param n number of points in y
+ * @param k dimensionality
+ *
+ * @note If the specified DistT doesn't need the workspace at all, it
+ * returns 0.
+ */
+template <raft::distance::DistanceType DistT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT = int>
+size_t getWorkspaceSize(const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) RAFT_EXPLICIT;
+
+/**
+ * @brief Return the exact workspace size to compute the distance
+ * @tparam DistT which distance to evaluate
+ * @tparam DataT input argument type
+ * @tparam AccT accumulation type
+ * @tparam OutT output type
+ * @tparam IdxT Index type
+ * @param x first set of points (size m*k)
+ * @param y second set of points (size n*k)
+ * @return number of bytes needed in workspace
+ *
+ * @note If the specified DistT doesn't need the workspace at all, it
+ * returns 0.
+ */
+template <raft::distance::DistanceType DistT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT = int,
+          typename layout>
+size_t getWorkspaceSize(raft::device_matrix_view<DataT, IdxT, layout> const& x,
+                        raft::device_matrix_view<DataT, IdxT, layout> const& y) RAFT_EXPLICIT;
+
+/**
+ * @brief Evaluate pairwise distances for the simple use case
+ * @tparam DistT which distance to evaluate
+ * @tparam DataT input argument type
+ * @tparam AccT accumulation type
+ * @tparam OutT output type
+ * @tparam IdxT Index type
+ * @param handle raft handle for managing expensive resources
+ * @param x first set of points
+ * @param y second set of points
+ * @param dist output distance matrix
+ * @param m number of points in x
+ * @param n number of points in y
+ * @param k dimensionality
+ * @param isRowMajor whether the matrices are row-major or col-major
+ * @param metric_arg metric argument (used for Minkowski distance)
+ */
+template <raft::distance::DistanceType DistT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT = int>
+void distance(raft::resources const& handle,
+              const DataT* x,
+              const DataT* y,
+              OutT* dist,
+              IdxT m,
+              IdxT n,
+              IdxT k,
+              bool isRowMajor  = true,
+              DataT metric_arg = 2.0f) RAFT_EXPLICIT;
+
+/**
+ * @brief Convenience wrapper around 'distance' prim to convert runtime metric
+ * into compile time for the purpose of dispatch
+ * @tparam Type input/accumulation/output data-type
+ * @tparam IdxT indexing type
+ * @param handle raft handle for managing expensive resources
+ * @param x first set of points
+ * @param y second set of points
+ * @param dist output distance matrix
+ * @param m number of points in x
+ * @param n number of points in y
+ * @param k dimensionality
+ * @param workspace temporary workspace buffer which can get resized as per the
+ * needed workspace size
+ * @param metric distance metric
+ * @param isRowMajor whether the matrices are row-major or col-major
+ * @param metric_arg metric argument (used for Minkowski distance)
+ */
+template <typename Type, typename IdxT = int>
+void pairwise_distance(raft::resources const& handle,
+                       const Type* x,
+                       const Type* y,
+                       Type* dist,
+                       IdxT m,
+                       IdxT n,
+                       IdxT k,
+                       rmm::device_uvector<char>& workspace,
+                       raft::distance::DistanceType metric,
+                       bool isRowMajor = true,
+                       Type metric_arg = 2.0f) RAFT_EXPLICIT;
+
+/**
+ * @brief Convenience wrapper around 'distance' prim to convert runtime metric
+ * into compile time for the purpose of dispatch
+ * @tparam Type input/accumulation/output data-type
+ * @tparam IdxT indexing type
+ * @param handle raft handle for managing expensive resources
+ * @param x first set of points
+ * @param y second set of points
+ * @param dist output distance matrix
+ * @param m number of points in x
+ * @param n number of points in y
+ * @param k dimensionality
+ * @param metric distance metric
+ * @param isRowMajor whether the matrices are row-major or col-major
+ * @param metric_arg metric argument (used for Minkowski distance)
+ */
+template <typename Type, typename IdxT = int>
+void pairwise_distance(raft::resources const& handle,
+                       const Type* x,
+                       const Type* y,
+                       Type* dist,
+                       IdxT m,
+                       IdxT n,
+                       IdxT k,
+                       raft::distance::DistanceType metric,
+                       bool isRowMajor = true,
+                       Type metric_arg = 2.0f) RAFT_EXPLICIT;
+/** @} */
+
+/**
+ * \defgroup distance_mdspan Pairwise distance functions
+ * @{
+ */
+
+/**
+ * @brief Evaluate pairwise distances for the simple use case.
+ *
+ * Note: Only contiguous row- or column-major layouts supported currently.
+ *
+ * Usage example:
+ * @code{.cpp}
+ * #include <raft/core/device_resources.hpp>
+ * #include <raft/core/device_mdarray.hpp>
+ * #include <raft/random/make_blobs.cuh>
+ * #include <raft/distance/distance.cuh>
+ *
+ * raft::device_resources handle;
+ * int n_samples = 5000;
+ * int n_features = 50;
+ *
+ * auto input = raft::make_device_matrix<float>(handle, n_samples, n_features);
+ * auto labels = raft::make_device_vector<int>(handle, n_samples);
+ * auto output = raft::make_device_matrix<float>(handle, n_samples, n_samples);
+ *
+ * raft::random::make_blobs(handle, input.view(), labels.view());
+ * auto metric = raft::distance::DistanceType::L2SqrtExpanded;
+ * raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric);
+ * @endcode
+ *
+ * @tparam DistT which distance to evaluate
+ * @tparam DataT input argument type
+ * @tparam AccT accumulation type
+ * @tparam OutT output type
+ * @tparam IdxT Index type
+ * @param handle raft handle for managing expensive resources
+ * @param x first set of points (size m*k)
+ * @param y second set of points (size n*k)
+ * @param dist output distance matrix (size m*n)
+ * @param metric_arg metric argument (used for Minkowski distance)
+ */
+template <raft::distance::DistanceType DistT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename layout = raft::layout_c_contiguous,
+          typename IdxT   = int>
+void distance(raft::resources const& handle,
+              raft::device_matrix_view<DataT, IdxT, layout> const x,
+              raft::device_matrix_view<DataT, IdxT, layout> const y,
+              raft::device_matrix_view<OutT, IdxT, layout> dist,
+              DataT metric_arg = 2.0f) RAFT_EXPLICIT;
+
+/**
+ * @brief Convenience wrapper around 'distance' prim to convert runtime metric
+ * into compile time for the purpose of dispatch
+ * @tparam Type input/accumulation/output data-type
+ * @tparam IdxT indexing type
+ * @param handle raft handle for managing expensive resources
+ * @param x first matrix of points (size mxk)
+ * @param y second matrix of points (size nxk)
+ * @param dist output distance matrix (size mxn)
+ * @param metric distance metric
+ * @param metric_arg metric argument (used for Minkowski distance)
+ */
+template <typename Type, typename layout = layout_c_contiguous, typename IdxT = int>
+void pairwise_distance(raft::resources const& handle,
+                       device_matrix_view<Type, IdxT, layout> const x,
+                       device_matrix_view<Type, IdxT, layout> const y,
+                       device_matrix_view<Type, IdxT, layout> dist,
+                       raft::distance::DistanceType metric,
+                       Type metric_arg = 2.0f) RAFT_EXPLICIT;
+
+/** @} */
+
+};  // namespace distance
+};  // namespace raft
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+
+#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, FinalLambda, IdxT)       \
+  extern template void raft::distance::distance<DT, DataT, AccT, OutT, FinalLambda, IdxT>( \
+    raft::resources const& handle,                                                         \
+    const DataT* x,                                                                        \
+    const DataT* y,                                                                        \
+    OutT* dist,                                                                            \
+    IdxT m,                                                                                \
+    IdxT n,                                                                                \
+    IdxT k,                                                                                \
+    void* workspace,                                                                       \
+    size_t worksize,                                                                       \
+    FinalLambda fin_op,                                                                    \
+    bool isRowMajor,                                                                       \
+    DataT metric_arg)
+
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CorrelationExpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::identity_op,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CosineExpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CosineExpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HammingUnexpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HammingUnexpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HellingerExpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HellingerExpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L1, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L1, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtExpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtExpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtUnexpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtUnexpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Linf, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Linf, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::RusselRaoExpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::RusselRaoExpanded, double, double, double, raft::identity_op, int);
+
+#undef instantiate_raft_distance_distance
+
+// Same, but without raft::identity_op
+#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, IdxT)       \
+  extern template void raft::distance::distance<DT, DataT, AccT, OutT, IdxT>( \
+    raft::resources const& handle,                                            \
+    const DataT* x,                                                           \
+    const DataT* y,                                                           \
+    OutT* dist,                                                               \
+    IdxT m,                                                                   \
+    IdxT n,                                                                   \
+    IdxT k,                                                                   \
+    void* workspace,                                                          \
+    size_t worksize,                                                          \
+    bool isRowMajor,                                                          \
+    DataT metric_arg)
+
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CorrelationExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CorrelationExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CosineExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CosineExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HammingUnexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HammingUnexpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HellingerExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HellingerExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, double, double, double, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L1, float, float, float, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L1, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtUnexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtUnexpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, double, double, double, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::Linf, float, float, float, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::Linf, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::RusselRaoExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::RusselRaoExpanded, double, double, double, int);
+
+#undef instantiate_raft_distance_distance
+
+// Same, but without workspace
+#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, IdxT)       \
+  extern template void raft::distance::distance<DT, DataT, AccT, OutT, IdxT>( \
+    raft::resources const& handle,                                            \
+    const DataT* x,                                                           \
+    const DataT* y,                                                           \
+    OutT* dist,                                                               \
+    IdxT m,                                                                   \
+    IdxT n,                                                                   \
+    IdxT k,                                                                   \
+    bool isRowMajor,                                                          \
+    DataT metric_arg)
+
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CorrelationExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CorrelationExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CosineExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CosineExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HammingUnexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HammingUnexpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HellingerExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HellingerExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, double, double, double, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L1, float, float, float, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L1, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtUnexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtUnexpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, double, double, double, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::Linf, float, float, float, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::Linf, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::RusselRaoExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::RusselRaoExpanded, double, double, double, int);
+
+#undef instantiate_raft_distance_distance
+
+#define instantiate_raft_distance_getWorkspaceSize(DistT, DataT, AccT, OutT, IdxT)         \
+  extern template size_t raft::distance::getWorkspaceSize<DistT, DataT, AccT, OutT, IdxT>( \
+    const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k)
+
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Canberra, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Canberra, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::CorrelationExpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::CorrelationExpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::CosineExpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::CosineExpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::HammingUnexpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::HammingUnexpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::HellingerExpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::HellingerExpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::InnerProduct, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::InnerProduct, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::JensenShannon, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::JensenShannon, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::KLDivergence, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::KLDivergence, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L1, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L1, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2SqrtExpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2SqrtExpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2SqrtUnexpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2SqrtUnexpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Unexpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Linf, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Linf, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::LpUnexpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::LpUnexpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::RusselRaoExpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::RusselRaoExpanded, double, double, double, int);
+
+#undef instantiate_raft_distance_getWorkspaceSize
+
+#define instantiate_raft_distance_getWorkspaceSize(DistT, DataT, AccT, OutT, IdxT, layout)         \
+  extern template size_t raft::distance::getWorkspaceSize<DistT, DataT, AccT, OutT, IdxT, layout>( \
+    raft::device_matrix_view<DataT, IdxT, layout> const& x,                                        \
+    raft::device_matrix_view<DataT, IdxT, layout> const& y)
+
+// We could consider not templating this function; the number of instantiations seems excessive.
+// NOTE(review): the list ends at L2Unexpanded/float/layout_f_contiguous; is the cutoff intended?
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Canberra, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Canberra, double, double, double, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Canberra, float, float, float, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Canberra, double, double, double, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CorrelationExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CorrelationExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CorrelationExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CorrelationExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HellingerExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HellingerExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HellingerExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HellingerExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::InnerProduct, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::InnerProduct,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::InnerProduct, float, float, float, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::InnerProduct,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::JensenShannon, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::JensenShannon,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::JensenShannon, float, float, float, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::JensenShannon,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::KLDivergence, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::KLDivergence,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::KLDivergence, float, float, float, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::KLDivergence,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L1, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L1, double, double, double, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L1, float, float, float, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L1, double, double, double, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, double, double, double, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, float, float, float, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, double, double, double, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2Unexpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, int, raft::layout_f_contiguous);
+
+#undef instantiate_raft_distance_getWorkspaceSize
+
+#define instantiate_raft_distance_pairwise_distance(DataT, IdxT)                               \
+  extern template void raft::distance::pairwise_distance(raft::resources const& handle,        \
+                                                         const DataT* x,                       \
+                                                         const DataT* y,                       \
+                                                         DataT* dist,                          \
+                                                         IdxT m,                               \
+                                                         IdxT n,                               \
+                                                         IdxT k,                               \
+                                                         rmm::device_uvector<char>& workspace, \
+                                                         raft::distance::DistanceType metric,  \
+                                                         bool isRowMajor,                      \
+                                                         DataT metric_arg)
+
+instantiate_raft_distance_pairwise_distance(float, int);
+instantiate_raft_distance_pairwise_distance(double, int);
+
+#undef instantiate_raft_distance_pairwise_distance
+
+// Same as above, but without the caller-provided workspace argument.
+#define instantiate_raft_distance_pairwise_distance(DataT, IdxT)                              \
+  extern template void raft::distance::pairwise_distance(raft::resources const& handle,       \
+                                                         const DataT* x,                      \
+                                                         const DataT* y,                      \
+                                                         DataT* dist,                         \
+                                                         IdxT m,                              \
+                                                         IdxT n,                              \
+                                                         IdxT k,                              \
+                                                         raft::distance::DistanceType metric, \
+                                                         bool isRowMajor,                     \
+                                                         DataT metric_arg)
+
+instantiate_raft_distance_pairwise_distance(float, int);
+instantiate_raft_distance_pairwise_distance(double, int);
+
+#undef instantiate_raft_distance_pairwise_distance
+
+// Version taking mdspan arguments (raft::device_matrix_view).
+#define instantiate_raft_distance_distance(DistT, DataT, AccT, OutT, layout, IdxT)       \
+  extern template void raft::distance::distance<DistT, DataT, AccT, OutT, layout, IdxT>( \
+    raft::resources const& handle,                                                       \
+    raft::device_matrix_view<DataT, IdxT, layout> const x,                               \
+    raft::device_matrix_view<DataT, IdxT, layout> const y,                               \
+    raft::device_matrix_view<OutT, IdxT, layout> dist,                                   \
+    DataT metric_arg)
+
+// Again, we might want to consider reining in the number of instantiations...
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, double, double, double, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, double, double, double, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HellingerExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HellingerExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HellingerExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HellingerExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::InnerProduct,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::InnerProduct,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::JensenShannon,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::JensenShannon,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::KLDivergence,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::KLDivergence,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L1, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L1, double, double, double, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L1, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L1, double, double, double, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, double, double, double, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, double, double, double, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Unexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Unexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Linf, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Linf, double, double, double, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Linf, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Linf, double, double, double, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::LpUnexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::LpUnexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::RusselRaoExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::RusselRaoExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::RusselRaoExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::RusselRaoExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+
+#undef instantiate_raft_distance_distance
+
+#define instantiate_raft_distance_pairwise_distance(DataT, layout, IdxT) \
+  extern template void raft::distance::pairwise_distance(                \
+    raft::resources const& handle,                                       \
+    raft::device_matrix_view<DataT, IdxT, layout> const x,               \
+    raft::device_matrix_view<DataT, IdxT, layout> const y,               \
+    raft::device_matrix_view<DataT, IdxT, layout> dist,                  \
+    raft::distance::DistanceType metric,                                 \
+    DataT metric_arg)
+
+instantiate_raft_distance_pairwise_distance(float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_pairwise_distance(float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_pairwise_distance(double, raft::layout_c_contiguous, int);
+instantiate_raft_distance_pairwise_distance(double, raft::layout_f_contiguous, int);
+
+#undef instantiate_raft_distance_pairwise_distance
diff --git a/cpp/include/raft/distance/distance-inl.cuh b/cpp/include/raft/distance/distance-inl.cuh
index 5216902635..3399443765 100644
--- a/cpp/include/raft/distance/distance-inl.cuh
+++ b/cpp/include/raft/distance/distance-inl.cuh
@@ -13,9 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef __DISTANCE_H
-#define __DISTANCE_H
-
 #pragma once
 
 #include <raft/core/resource/cuda_stream.hpp>
@@ -38,11 +35,11 @@ namespace distance {
 /**
  * @brief Evaluate pairwise distances with the user epilogue lamba allowed
  * @tparam DistanceType which distance to evaluate
- * @tparam InType input argument type
- * @tparam AccType accumulation type
- * @tparam OutType output type
+ * @tparam DataT input argument type
+ * @tparam AccT accumulation type
+ * @tparam OutT output type
  * @tparam FinalLambda user-defined epilogue lamba
- * @tparam Index_ Index type
+ * @tparam IdxT Index type
  * @param handle raft handle for managing expensive resources
  * @param x first set of points
  * @param y second set of points
@@ -57,40 +54,40 @@ namespace distance {
  * @param metric_arg metric argument (used for Minkowski distance)
  *
  * @note fin_op: This is a device lambda which is supposed to operate upon the
- * input which is AccType and returns the output in OutType. It's signature is
- * as follows:  <pre>OutType fin_op(AccType in, int g_idx);</pre>. If one needs
+ * input which is AccT and returns the output in OutT. Its signature is
+ * as follows:  <pre>OutT fin_op(AccT in, int g_idx);</pre>. If one needs
  * any other parameters, feel free to pass them via closure.
  */
-template <raft::distance::DistanceType distanceType,
-          typename InType,
-          typename AccType,
-          typename OutType,
+template <raft::distance::DistanceType DistT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
           typename FinalLambda,
-          typename Index_ = int>
+          typename IdxT = int>
 void distance(raft::resources const& handle,
-              const InType* x,
-              const InType* y,
-              OutType* dist,
-              Index_ m,
-              Index_ n,
-              Index_ k,
+              const DataT* x,
+              const DataT* y,
+              OutT* dist,
+              IdxT m,
+              IdxT n,
+              IdxT k,
               void* workspace,
               size_t worksize,
               FinalLambda fin_op,
-              bool isRowMajor   = true,
-              InType metric_arg = 2.0f)
+              bool isRowMajor  = true,
+              DataT metric_arg = 2.0f)
 {
-  detail::distance<distanceType, InType, AccType, OutType, FinalLambda, Index_>(
+  detail::distance<DistT, DataT, AccT, OutT, FinalLambda, IdxT>(
     handle, x, y, dist, m, n, k, workspace, worksize, fin_op, isRowMajor, metric_arg);
 }
 
 /**
  * @brief Evaluate pairwise distances for the simple use case
  * @tparam DistanceType which distance to evaluate
- * @tparam InType input argument type
- * @tparam AccType accumulation type
- * @tparam OutType output type
- * @tparam Index_ Index type
+ * @tparam DataT input argument type
+ * @tparam AccT accumulation type
+ * @tparam OutT output type
+ * @tparam IdxT Index type
  * @param handle raft handle for managing expensive resources
  * @param x first set of points
  * @param y second set of points
@@ -103,89 +100,89 @@ void distance(raft::resources const& handle,
  * @param isRowMajor whether the matrices are row-major or col-major
  * @param metric_arg metric argument (used for Minkowski distance)
  */
-template <raft::distance::DistanceType distanceType,
-          typename InType,
-          typename AccType,
-          typename OutType,
-          typename Index_ = int>
+template <raft::distance::DistanceType DistT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT = int>
 void distance(raft::resources const& handle,
-              const InType* x,
-              const InType* y,
-              OutType* dist,
-              Index_ m,
-              Index_ n,
-              Index_ k,
+              const DataT* x,
+              const DataT* y,
+              OutT* dist,
+              IdxT m,
+              IdxT n,
+              IdxT k,
               void* workspace,
               size_t worksize,
-              bool isRowMajor   = true,
-              InType metric_arg = 2.0f)
+              bool isRowMajor  = true,
+              DataT metric_arg = 2.0f)
 {
-  detail::distance<distanceType, InType, AccType, OutType, Index_>(
+  detail::distance<DistT, DataT, AccT, OutT, IdxT>(
     handle, x, y, dist, m, n, k, workspace, worksize, isRowMajor, metric_arg);
 }
 
 /**
  * @brief Return the exact workspace size to compute the distance
  * @tparam DistanceType which distance to evaluate
- * @tparam InType input argument type
- * @tparam AccType accumulation type
- * @tparam OutType output type
- * @tparam Index_ Index type
+ * @tparam DataT input argument type
+ * @tparam AccT accumulation type
+ * @tparam OutT output type
+ * @tparam IdxT Index type
  * @param x first set of points
  * @param y second set of points
  * @param m number of points in x
  * @param n number of points in y
  * @param k dimensionality
  *
- * @note If the specified distanceType doesn't need the workspace at all, it
+ * @note If the specified DistT doesn't need the workspace at all, it
  * returns 0.
  */
-template <raft::distance::DistanceType distanceType,
-          typename InType,
-          typename AccType,
-          typename OutType,
-          typename Index_ = int>
-size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k)
+template <raft::distance::DistanceType DistT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT = int>
+size_t getWorkspaceSize(const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k)
 {
-  return detail::getWorkspaceSize<distanceType, InType, AccType, OutType, Index_>(x, y, m, n, k);
+  return detail::getWorkspaceSize<DistT, DataT, AccT, OutT, IdxT>(x, y, m, n, k);
 }
 
 /**
  * @brief Return the exact workspace size to compute the distance
  * @tparam DistanceType which distance to evaluate
- * @tparam InType input argument type
- * @tparam AccType accumulation type
- * @tparam OutType output type
- * @tparam Index_ Index type
+ * @tparam DataT input argument type
+ * @tparam AccT accumulation type
+ * @tparam OutT output type
+ * @tparam IdxT Index type
  * @param x first set of points (size m*k)
  * @param y second set of points (size n*k)
  * @return number of bytes needed in workspace
  *
- * @note If the specified distanceType doesn't need the workspace at all, it
+ * @note If the specified DistT doesn't need the workspace at all, it
  * returns 0.
  */
-template <raft::distance::DistanceType distanceType,
-          typename InType,
-          typename AccType,
-          typename OutType,
-          typename Index_ = int,
+template <raft::distance::DistanceType DistT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT = int,
           typename layout>
-size_t getWorkspaceSize(const raft::device_matrix_view<InType, layout> x,
-                        const raft::device_matrix_view<InType, layout> y)
+size_t getWorkspaceSize(raft::device_matrix_view<DataT, IdxT, layout> const& x,
+                        raft::device_matrix_view<DataT, IdxT, layout> const& y)
 {
   RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal.");
 
-  return getWorkspaceSize<distanceType, InType, AccType, OutType, Index_>(
-    x.data(), y.data(), x.extent(0), y.extent(0), x.extent(1));
+  return getWorkspaceSize<DistT, DataT, AccT, OutT, IdxT>(
+    x.data_handle(), y.data_handle(), x.extent(0), y.extent(0), x.extent(1));
 }
 
 /**
  * @brief Evaluate pairwise distances for the simple use case
  * @tparam DistanceType which distance to evaluate
- * @tparam InType input argument type
- * @tparam AccType accumulation type
- * @tparam OutType output type
- * @tparam Index_ Index type
+ * @tparam DataT input argument type
+ * @tparam AccT accumulation type
+ * @tparam OutT output type
+ * @tparam IdxT Index type
  * @param handle raft handle for managing expensive resources
  * @param x first set of points
  * @param y second set of points
@@ -196,26 +193,26 @@ size_t getWorkspaceSize(const raft::device_matrix_view<InType, layout> x,
  * @param isRowMajor whether the matrices are row-major or col-major
  * @param metric_arg metric argument (used for Minkowski distance)
  */
-template <raft::distance::DistanceType distanceType,
-          typename InType,
-          typename AccType,
-          typename OutType,
-          typename Index_ = int>
+template <raft::distance::DistanceType DistT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT = int>
 void distance(raft::resources const& handle,
-              const InType* x,
-              const InType* y,
-              OutType* dist,
-              Index_ m,
-              Index_ n,
-              Index_ k,
-              bool isRowMajor   = true,
-              InType metric_arg = 2.0f)
+              const DataT* x,
+              const DataT* y,
+              OutT* dist,
+              IdxT m,
+              IdxT n,
+              IdxT k,
+              bool isRowMajor  = true,
+              DataT metric_arg = 2.0f)
 {
   auto stream = raft::resource::get_cuda_stream(handle);
   rmm::device_uvector<char> workspace(0, stream);
-  auto worksize = getWorkspaceSize<distanceType, InType, AccType, OutType, Index_>(x, y, m, n, k);
+  auto worksize = getWorkspaceSize<DistT, DataT, AccT, OutT, IdxT>(x, y, m, n, k);
   workspace.resize(worksize, stream);
-  detail::distance<distanceType, InType, AccType, OutType, Index_>(
+  detail::distance<DistT, DataT, AccT, OutT, IdxT>(
     handle, x, y, dist, m, n, k, workspace.data(), worksize, isRowMajor, metric_arg);
 }
 
@@ -223,7 +220,7 @@ void distance(raft::resources const& handle,
  * @brief Convenience wrapper around 'distance' prim to convert runtime metric
  * into compile time for the purpose of dispatch
  * @tparam Type input/accumulation/output data-type
- * @tparam Index_ indexing type
+ * @tparam IdxT indexing type
  * @param handle raft handle for managing expensive resources
  * @param x first set of points
  * @param y second set of points
@@ -237,14 +234,14 @@ void distance(raft::resources const& handle,
  * @param isRowMajor whether the matrices are row-major or col-major
  * @param metric_arg metric argument (used for Minkowski distance)
  */
-template <typename Type, typename Index_ = int>
+template <typename Type, typename IdxT = int>
 void pairwise_distance(raft::resources const& handle,
                        const Type* x,
                        const Type* y,
                        Type* dist,
-                       Index_ m,
-                       Index_ n,
-                       Index_ k,
+                       IdxT m,
+                       IdxT n,
+                       IdxT k,
                        rmm::device_uvector<char>& workspace,
                        raft::distance::DistanceType metric,
                        bool isRowMajor = true,
@@ -253,9 +250,9 @@ void pairwise_distance(raft::resources const& handle,
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
 
   auto dispatch = [&](auto distance_type) {
-    auto worksize = getWorkspaceSize<distance_type(), Type, Type, Type, Index_>(x, y, m, n, k);
+    auto worksize = getWorkspaceSize<distance_type(), Type, Type, Type, IdxT>(x, y, m, n, k);
     workspace.resize(worksize, stream);
-    detail::distance<distance_type(), Type, Type, Type, Index_>(
+    detail::distance<distance_type(), Type, Type, Type, IdxT>(
       handle, x, y, dist, m, n, k, workspace.data(), worksize, isRowMajor, metric_arg);
   };
 
@@ -316,7 +313,7 @@ void pairwise_distance(raft::resources const& handle,
  * @brief Convenience wrapper around 'distance' prim to convert runtime metric
  * into compile time for the purpose of dispatch
  * @tparam Type input/accumulation/output data-type
- * @tparam Index_ indexing type
+ * @tparam IdxT indexing type
  * @param handle raft handle for managing expensive resources
  * @param x first set of points
  * @param y second set of points
@@ -328,21 +325,21 @@ void pairwise_distance(raft::resources const& handle,
  * @param isRowMajor whether the matrices are row-major or col-major
  * @param metric_arg metric argument (used for Minkowski distance)
  */
-template <typename Type, typename Index_ = int>
+template <typename Type, typename IdxT = int>
 void pairwise_distance(raft::resources const& handle,
                        const Type* x,
                        const Type* y,
                        Type* dist,
-                       Index_ m,
-                       Index_ n,
-                       Index_ k,
+                       IdxT m,
+                       IdxT n,
+                       IdxT k,
                        raft::distance::DistanceType metric,
                        bool isRowMajor = true,
                        Type metric_arg = 2.0f)
 {
   auto stream = raft::resource::get_cuda_stream(handle);
   rmm::device_uvector<char> workspace(0, stream);
-  pairwise_distance<Type, Index_>(
+  pairwise_distance<Type, IdxT>(
     handle, x, y, dist, m, n, k, workspace, metric, isRowMajor, metric_arg);
 }
 
@@ -379,27 +376,27 @@ void pairwise_distance(raft::resources const& handle,
  * @endcode
  *
  * @tparam DistanceType which distance to evaluate
- * @tparam InType input argument type
- * @tparam AccType accumulation type
- * @tparam OutType output type
- * @tparam Index_ Index type
+ * @tparam DataT input argument type
+ * @tparam AccT accumulation type
+ * @tparam OutT output type
+ * @tparam IdxT Index type
  * @param handle raft handle for managing expensive resources
  * @param x first set of points (size n*k)
  * @param y second set of points (size m*k)
  * @param dist output distance matrix (size n*m)
  * @param metric_arg metric argument (used for Minkowski distance)
  */
-template <raft::distance::DistanceType distanceType,
-          typename InType,
-          typename AccType,
-          typename OutType,
+template <raft::distance::DistanceType DistT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
           typename layout = raft::layout_c_contiguous,
-          typename Index_ = int>
+          typename IdxT   = int>
 void distance(raft::resources const& handle,
-              raft::device_matrix_view<InType, Index_, layout> const x,
-              raft::device_matrix_view<InType, Index_, layout> const y,
-              raft::device_matrix_view<OutType, Index_, layout> dist,
-              InType metric_arg = 2.0f)
+              raft::device_matrix_view<DataT, IdxT, layout> const x,
+              raft::device_matrix_view<DataT, IdxT, layout> const y,
+              raft::device_matrix_view<OutT, IdxT, layout> dist,
+              DataT metric_arg = 2.0f)
 {
   RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal.");
   RAFT_EXPECTS(dist.extent(0) == x.extent(0),
@@ -414,22 +411,22 @@ void distance(raft::resources const& handle,
 
   constexpr auto is_rowmajor = std::is_same_v<layout, layout_c_contiguous>;
 
-  distance<distanceType, InType, AccType, OutType, Index_>(handle,
-                                                           x.data_handle(),
-                                                           y.data_handle(),
-                                                           dist.data_handle(),
-                                                           x.extent(0),
-                                                           y.extent(0),
-                                                           x.extent(1),
-                                                           is_rowmajor,
-                                                           metric_arg);
+  distance<DistT, DataT, AccT, OutT, IdxT>(handle,
+                                           x.data_handle(),
+                                           y.data_handle(),
+                                           dist.data_handle(),
+                                           x.extent(0),
+                                           y.extent(0),
+                                           x.extent(1),
+                                           is_rowmajor,
+                                           metric_arg);
 }
 
 /**
  * @brief Convenience wrapper around 'distance' prim to convert runtime metric
  * into compile time for the purpose of dispatch
  * @tparam Type input/accumulation/output data-type
- * @tparam Index_ indexing type
+ * @tparam IdxT indexing type
  * @param handle raft handle for managing expensive resources
  * @param x first matrix of points (size mxk)
  * @param y second matrix of points (size nxk)
@@ -437,11 +434,11 @@ void distance(raft::resources const& handle,
  * @param metric distance metric
  * @param metric_arg metric argument (used for Minkowski distance)
  */
-template <typename Type, typename layout = layout_c_contiguous, typename Index_ = int>
+template <typename Type, typename layout = layout_c_contiguous, typename IdxT = int>
 void pairwise_distance(raft::resources const& handle,
-                       device_matrix_view<Type, Index_, layout> const x,
-                       device_matrix_view<Type, Index_, layout> const y,
-                       device_matrix_view<Type, Index_, layout> dist,
+                       device_matrix_view<Type, IdxT, layout> const x,
+                       device_matrix_view<Type, IdxT, layout> const y,
+                       device_matrix_view<Type, IdxT, layout> dist,
                        raft::distance::DistanceType metric,
                        Type metric_arg = 2.0f)
 {
@@ -478,5 +475,3 @@ void pairwise_distance(raft::resources const& handle,
 
 };  // namespace distance
 };  // namespace raft
-
-#endif
diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh
index e69de29bb2..bf38d5e5fe 100644
--- a/cpp/include/raft/distance/distance.cuh
+++ b/cpp/include/raft/distance/distance.cuh
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "distance-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "distance-ext.cuh"
+#endif
diff --git a/cpp/include/raft/distance/specializations/detail/00_write_template.py b/cpp/include/raft/distance/specializations/detail/00_write_template.py
deleted file mode 100644
index 63ae6580b4..0000000000
--- a/cpp/include/raft/distance/specializations/detail/00_write_template.py
+++ /dev/null
@@ -1,148 +0,0 @@
-#!/usr/bin/env python3
-
-# This template manages all files in this directory, apart from
-# inner_product.cuh and kernels.cuh.
-
-
-# NOTE: this template is not perfectly formatted. Use pre-commit to get
-# everything in shape again.
-start_template = """/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-"""
-
-extern_template = """
-extern template void pairwise_matrix_instantiation_point<OpT,
-                                                         IdxT,
-                                                         DataT,
-                                                         OutT,
-                                                         FinopT>(
-  OpT,
-  pairwise_matrix_params<IdxT, DataT, OutT, FinopT>,
-  cudaStream_t);
-"""
-
-end_template = """}  // namespace raft::distance::detail
-"""
-
-data_type_instances = [
-    dict(
-        DataT="float",
-        AccT="float",
-        OutT="float",
-        IdxT="int",
-    ),
-    dict(
-        DataT="double",
-        AccT="double",
-        OutT="double",
-        IdxT="int",
-    ),
-]
-
-
-
-
-op_instances = [
-    dict(
-        path_prefix="canberra",
-        OpT="ops::canberra_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="correlation",
-        OpT="ops::correlation_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="cosine",
-        OpT="ops::cosine_distance_op<DataT, AccT, IdxT>",
-        # cosine uses CUTLASS for SM80+
-    ),
-    dict(
-        path_prefix="hamming_unexpanded",
-        OpT="ops::hamming_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="hellinger_expanded",
-        OpT="ops::hellinger_distance_op<DataT, AccT, IdxT>",
-    ),
-    # inner product is handled by cublas.
-    dict(
-        path_prefix="jensen_shannon",
-        OpT="ops::jensen_shannon_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="kl_divergence",
-        OpT="ops::kl_divergence_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="l1",
-        OpT="ops::l1_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="l2_expanded",
-        OpT="ops::l2_exp_distance_op<DataT, AccT, IdxT>",
-        # L2 expanded uses CUTLASS for SM80+
-    ),
-    dict(
-        path_prefix="l2_unexpanded",
-        OpT="ops::l2_unexp_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="l_inf",
-        OpT="ops::l_inf_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="lp_unexpanded",
-        OpT="ops::lp_unexp_distance_op<DataT, AccT, IdxT>",
-    ),
-    dict(
-        path_prefix="russel_rao",
-        OpT="ops::russel_rao_distance_op<DataT, AccT, IdxT>",
-    ),
-]
-
-def fill_in(s, template):
-    for k, v in template.items():
-        s = s.replace(k, v)
-    return s
-
-for op_instance in op_instances:
-    path = fill_in("path_prefix.cuh", op_instance)
-    with open(path, "w") as f:
-        f.write(start_template)
-
-        for data_type_instance in data_type_instances:
-            op_data_instance = {
-                k : fill_in(v, data_type_instance)
-                for k, v in op_instance.items()
-            }
-            instance = {
-                **op_data_instance,
-                **data_type_instance,
-                "FinopT": "raft::identity_op",
-            }
-
-            text = fill_in(extern_template, instance)
-
-            f.write(text)
-
-        f.write(end_template)
diff --git a/cpp/include/raft/distance/specializations/detail/canberra.cuh b/cpp/include/raft/distance/specializations/detail/canberra.cuh
deleted file mode 100644
index 276c85e5f6..0000000000
--- a/cpp/include/raft/distance/specializations/detail/canberra.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::canberra_distance_op<float, float, int>,
-  int,
-  float,
-  float,
-  raft::identity_op>(ops::canberra_distance_op<float, float, int>,
-                     pairwise_matrix_params<int, float, float, raft::identity_op>,
-                     cudaStream_t);
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::canberra_distance_op<double, double, int>,
-  int,
-  double,
-  double,
-  raft::identity_op>(ops::canberra_distance_op<double, double, int>,
-                     pairwise_matrix_params<int, double, double, raft::identity_op>,
-                     cudaStream_t);
-}  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/specializations/detail/correlation.cuh b/cpp/include/raft/distance/specializations/detail/correlation.cuh
deleted file mode 100644
index f019f678df..0000000000
--- a/cpp/include/raft/distance/specializations/detail/correlation.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::correlation_distance_op<float, float, int>,
-  int,
-  float,
-  float,
-  raft::identity_op>(ops::correlation_distance_op<float, float, int>,
-                     pairwise_matrix_params<int, float, float, raft::identity_op>,
-                     cudaStream_t);
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::correlation_distance_op<double, double, int>,
-  int,
-  double,
-  double,
-  raft::identity_op>(ops::correlation_distance_op<double, double, int>,
-                     pairwise_matrix_params<int, double, double, raft::identity_op>,
-                     cudaStream_t);
-}  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/specializations/detail/cosine.cuh b/cpp/include/raft/distance/specializations/detail/cosine.cuh
deleted file mode 100644
index dcde4ec286..0000000000
--- a/cpp/include/raft/distance/specializations/detail/cosine.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-extern template void pairwise_matrix_instantiation_point<ops::cosine_distance_op<float, float, int>,
-                                                         int,
-                                                         float,
-                                                         float,
-                                                         raft::identity_op>(
-  ops::cosine_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, raft::identity_op>,
-  cudaStream_t);
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::cosine_distance_op<double, double, int>,
-  int,
-  double,
-  double,
-  raft::identity_op>(ops::cosine_distance_op<double, double, int>,
-                     pairwise_matrix_params<int, double, double, raft::identity_op>,
-                     cudaStream_t);
-}  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh b/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh
deleted file mode 100644
index 1d6964fbce..0000000000
--- a/cpp/include/raft/distance/specializations/detail/hamming_unexpanded.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::hamming_distance_op<float, float, int>,
-  int,
-  float,
-  float,
-  raft::identity_op>(ops::hamming_distance_op<float, float, int>,
-                     pairwise_matrix_params<int, float, float, raft::identity_op>,
-                     cudaStream_t);
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::hamming_distance_op<double, double, int>,
-  int,
-  double,
-  double,
-  raft::identity_op>(ops::hamming_distance_op<double, double, int>,
-                     pairwise_matrix_params<int, double, double, raft::identity_op>,
-                     cudaStream_t);
-}  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh b/cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh
deleted file mode 100644
index f96a06f919..0000000000
--- a/cpp/include/raft/distance/specializations/detail/hellinger_expanded.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::hellinger_distance_op<float, float, int>,
-  int,
-  float,
-  float,
-  raft::identity_op>(ops::hellinger_distance_op<float, float, int>,
-                     pairwise_matrix_params<int, float, float, raft::identity_op>,
-                     cudaStream_t);
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::hellinger_distance_op<double, double, int>,
-  int,
-  double,
-  double,
-  raft::identity_op>(ops::hellinger_distance_op<double, double, int>,
-                     pairwise_matrix_params<int, double, double, raft::identity_op>,
-                     cudaStream_t);
-}  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/specializations/detail/inner_product.cuh b/cpp/include/raft/distance/specializations/detail/inner_product.cuh
deleted file mode 100644
index d97d678928..0000000000
--- a/cpp/include/raft/distance/specializations/detail/inner_product.cuh
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-extern template void distance<raft::distance::DistanceType::InnerProduct, float, float, float, int>(
-  raft::resources const& handle,
-  const float* x,
-  const float* y,
-  float* dist,
-  int m,
-  int n,
-  int k,
-  void* workspace,
-  size_t worksize,
-  bool isRowMajor,
-  float metric_arg);
-
-extern template void
-distance<raft::distance::DistanceType::InnerProduct, double, double, double, int>(
-  raft::resources const& handle,
-  const double* x,
-  const double* y,
-  double* dist,
-  int m,
-  int n,
-  int k,
-  void* workspace,
-  size_t worksize,
-  bool isRowMajor,
-  double metric_arg);
-}  // namespace detail
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh b/cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh
deleted file mode 100644
index 0b58646582..0000000000
--- a/cpp/include/raft/distance/specializations/detail/jensen_shannon.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::jensen_shannon_distance_op<float, float, int>,
-  int,
-  float,
-  float,
-  raft::identity_op>(ops::jensen_shannon_distance_op<float, float, int>,
-                     pairwise_matrix_params<int, float, float, raft::identity_op>,
-                     cudaStream_t);
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::jensen_shannon_distance_op<double, double, int>,
-  int,
-  double,
-  double,
-  raft::identity_op>(ops::jensen_shannon_distance_op<double, double, int>,
-                     pairwise_matrix_params<int, double, double, raft::identity_op>,
-                     cudaStream_t);
-}  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/specializations/detail/kl_divergence.cuh b/cpp/include/raft/distance/specializations/detail/kl_divergence.cuh
deleted file mode 100644
index 5c164e0fd4..0000000000
--- a/cpp/include/raft/distance/specializations/detail/kl_divergence.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-extern template void pairwise_matrix_instantiation_point<ops::kl_divergence_op<float, float, int>,
-                                                         int,
-                                                         float,
-                                                         float,
-                                                         raft::identity_op>(
-  ops::kl_divergence_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, raft::identity_op>,
-  cudaStream_t);
-
-extern template void pairwise_matrix_instantiation_point<ops::kl_divergence_op<double, double, int>,
-                                                         int,
-                                                         double,
-                                                         double,
-                                                         raft::identity_op>(
-  ops::kl_divergence_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, raft::identity_op>,
-  cudaStream_t);
-}  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/specializations/detail/l1.cuh b/cpp/include/raft/distance/specializations/detail/l1.cuh
deleted file mode 100644
index 870627d909..0000000000
--- a/cpp/include/raft/distance/specializations/detail/l1.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-extern template void pairwise_matrix_instantiation_point<ops::l1_distance_op<float, float, int>,
-                                                         int,
-                                                         float,
-                                                         float,
-                                                         raft::identity_op>(
-  ops::l1_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, raft::identity_op>,
-  cudaStream_t);
-
-extern template void pairwise_matrix_instantiation_point<ops::l1_distance_op<double, double, int>,
-                                                         int,
-                                                         double,
-                                                         double,
-                                                         raft::identity_op>(
-  ops::l1_distance_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, raft::identity_op>,
-  cudaStream_t);
-}  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/specializations/detail/l2_expanded.cuh b/cpp/include/raft/distance/specializations/detail/l2_expanded.cuh
deleted file mode 100644
index ee3207bcce..0000000000
--- a/cpp/include/raft/distance/specializations/detail/l2_expanded.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-extern template void pairwise_matrix_instantiation_point<ops::l2_exp_distance_op<float, float, int>,
-                                                         int,
-                                                         float,
-                                                         float,
-                                                         raft::identity_op>(
-  ops::l2_exp_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, raft::identity_op>,
-  cudaStream_t);
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::l2_exp_distance_op<double, double, int>,
-  int,
-  double,
-  double,
-  raft::identity_op>(ops::l2_exp_distance_op<double, double, int>,
-                     pairwise_matrix_params<int, double, double, raft::identity_op>,
-                     cudaStream_t);
-}  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh b/cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh
deleted file mode 100644
index 1fbf57632b..0000000000
--- a/cpp/include/raft/distance/specializations/detail/l2_unexpanded.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::l2_unexp_distance_op<float, float, int>,
-  int,
-  float,
-  float,
-  raft::identity_op>(ops::l2_unexp_distance_op<float, float, int>,
-                     pairwise_matrix_params<int, float, float, raft::identity_op>,
-                     cudaStream_t);
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::l2_unexp_distance_op<double, double, int>,
-  int,
-  double,
-  double,
-  raft::identity_op>(ops::l2_unexp_distance_op<double, double, int>,
-                     pairwise_matrix_params<int, double, double, raft::identity_op>,
-                     cudaStream_t);
-}  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/specializations/detail/l_inf.cuh b/cpp/include/raft/distance/specializations/detail/l_inf.cuh
deleted file mode 100644
index 388d3bf439..0000000000
--- a/cpp/include/raft/distance/specializations/detail/l_inf.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-extern template void pairwise_matrix_instantiation_point<ops::l_inf_distance_op<float, float, int>,
-                                                         int,
-                                                         float,
-                                                         float,
-                                                         raft::identity_op>(
-  ops::l_inf_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, raft::identity_op>,
-  cudaStream_t);
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::l_inf_distance_op<double, double, int>,
-  int,
-  double,
-  double,
-  raft::identity_op>(ops::l_inf_distance_op<double, double, int>,
-                     pairwise_matrix_params<int, double, double, raft::identity_op>,
-                     cudaStream_t);
-}  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh b/cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh
deleted file mode 100644
index d8e86ce6f2..0000000000
--- a/cpp/include/raft/distance/specializations/detail/lp_unexpanded.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::lp_unexp_distance_op<float, float, int>,
-  int,
-  float,
-  float,
-  raft::identity_op>(ops::lp_unexp_distance_op<float, float, int>,
-                     pairwise_matrix_params<int, float, float, raft::identity_op>,
-                     cudaStream_t);
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::lp_unexp_distance_op<double, double, int>,
-  int,
-  double,
-  double,
-  raft::identity_op>(ops::lp_unexp_distance_op<double, double, int>,
-                     pairwise_matrix_params<int, double, double, raft::identity_op>,
-                     cudaStream_t);
-}  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/specializations/detail/russel_rao.cuh b/cpp/include/raft/distance/specializations/detail/russel_rao.cuh
deleted file mode 100644
index 4803fb8ab0..0000000000
--- a/cpp/include/raft/distance/specializations/detail/russel_rao.cuh
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance.cuh>
-
-namespace raft::distance::detail {
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::russel_rao_distance_op<float, float, int>,
-  int,
-  float,
-  float,
-  raft::identity_op>(ops::russel_rao_distance_op<float, float, int>,
-                     pairwise_matrix_params<int, float, float, raft::identity_op>,
-                     cudaStream_t);
-
-extern template void pairwise_matrix_instantiation_point<
-  ops::russel_rao_distance_op<double, double, int>,
-  int,
-  double,
-  double,
-  raft::identity_op>(ops::russel_rao_distance_op<double, double, int>,
-                     pairwise_matrix_params<int, double, double, raft::identity_op>,
-                     cudaStream_t);
-}  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh
index a34f696e9e..df53d896d6 100644
--- a/cpp/include/raft/distance/specializations/distance.cuh
+++ b/cpp/include/raft/distance/specializations/distance.cuh
@@ -16,19 +16,4 @@
 
 #pragma once
 
-#include <raft/distance/specializations/detail/canberra.cuh>
-#include <raft/distance/specializations/detail/correlation.cuh>
-#include <raft/distance/specializations/detail/cosine.cuh>
-#include <raft/distance/specializations/detail/hamming_unexpanded.cuh>
-#include <raft/distance/specializations/detail/hellinger_expanded.cuh>
-#include <raft/distance/specializations/detail/inner_product.cuh>
-#include <raft/distance/specializations/detail/jensen_shannon.cuh>
 #include <raft/distance/specializations/detail/kernels.cuh>
-#include <raft/distance/specializations/detail/kl_divergence.cuh>
-#include <raft/distance/specializations/detail/l1.cuh>
-#include <raft/distance/specializations/detail/l2_expanded.cuh>
-#include <raft/distance/specializations/detail/l2_unexpanded.cuh>
-#include <raft/distance/specializations/detail/l_inf.cuh>
-#include <raft/distance/specializations/detail/lp_unexpanded.cuh>
-#include <raft/distance/specializations/detail/russel_rao.cuh>
-#include <raft/distance/specializations/fused_l2_nn_min.cuh>
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch.cu
new file mode 100644
index 0000000000..7b91b3c3bf
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch.cu
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>
+#include <raft/distance/detail/distance_ops/all_ops.cuh>  // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>
+
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::canberra_distance_op, float, float, float, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::canberra_distance_op,
+// double, double, double, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::correlation_distance_op,
+// float, float, float, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::correlation_distance_op,
+// double, double, double, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::cosine_distance_op,
+// float, float, float, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::cosine_distance_op,
+// double, double, double, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::hamming_distance_op,
+// float, float, float, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::hamming_distance_op,
+// double, double, double, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::hellinger_distance_op,
+// float, float, float, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::hellinger_distance_op,
+// double, double, double, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::jensen_shannon_distance_op,
+// float, float, float, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::jensen_shannon_distance_op,
+// double, double, double, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::kl_divergence_op,
+// float, float, float, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::kl_divergence_op,
+// double, double, double, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l1_distance_op,
+// float, float, float, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l1_distance_op,
+// double, double, double, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l2_exp_distance_op,
+// float, float, float, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l2_exp_distance_op,
+// double, double, double, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l2_unexp_distance_op,
+// float, float, float, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l2_unexp_distance_op,
+// double, double, double, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l_inf_distance_op,
+// float, float, float, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l_inf_distance_op,
+// double, double, double, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::lp_unexp_distance_op,
+// float, float, float, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::lp_unexp_distance_op,
+// double, double, double, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::russel_rao_distance_op,
+// float, float, float, raft::identity_op, int);
+// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::russel_rao_distance_op,
+// double, double, double, raft::identity_op, int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
new file mode 100644
index 0000000000..8978697ead
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+
+# Preamble (license + common includes) written at the top of every generated .cu
+# file. NOTE: the output is not perfectly formatted; run pre-commit afterwards.
+header = """/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp> // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>  // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh> // dispatch
+"""
+
+# C++ macro text; each generated file invokes it once to explicitly instantiate the dispatch.
+macro = """
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \\
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \\
+  template void raft::distance::detail::                                               \\
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \\
+      OpT<DataT, AccT, IdxT> distance_op,                                              \\
+      IdxT m,                                                                          \\
+      IdxT n,                                                                          \\
+      IdxT k,                                                                          \\
+      const DataT* x,                                                                  \\
+      const DataT* y,                                                                  \\
+      const DataT* x_norm,                                                             \\
+      const DataT* y_norm,                                                             \\
+      OutT* out,                                                                       \\
+      FinOpT fin_op,                                                                   \\
+      cudaStream_t stream,                                                             \\
+      bool is_row_major)
+"""
+
+data_type_instances = [  # type combinations; every op is instantiated once per entry
+    dict(
+        DataT="float",
+        AccT="float",
+        OutT="float",
+        IdxT="int",
+    ),
+    dict(
+        DataT="double",
+        AccT="double",
+        OutT="double",
+        IdxT="int",
+    ),
+]
+
+op_instances = [  # per distance op: output file prefix, C++ op template, SM archs to include
+    dict(
+        path_prefix="canberra",
+        OpT="raft::distance::detail::ops::canberra_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="correlation",
+        OpT="raft::distance::detail::ops::correlation_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="cosine",
+        OpT="raft::distance::detail::ops::cosine_distance_op",
+        archs = [60, 80],
+    ),
+    dict(
+        path_prefix="hamming_unexpanded",
+        OpT="raft::distance::detail::ops::hamming_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="hellinger_expanded",
+        OpT="raft::distance::detail::ops::hellinger_distance_op",
+        archs = [60],
+    ),
+    # Inner product is handled by cuBLAS, so it has no entry in this list.
+    dict(
+        path_prefix="jensen_shannon",
+        OpT="raft::distance::detail::ops::jensen_shannon_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="kl_divergence",
+        OpT="raft::distance::detail::ops::kl_divergence_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="l1",
+        OpT="raft::distance::detail::ops::l1_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="l2_expanded",
+        OpT="raft::distance::detail::ops::l2_exp_distance_op",
+        archs = [60, 80],
+    ),
+    dict(
+        path_prefix="l2_unexpanded",
+        OpT="raft::distance::detail::ops::l2_unexp_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="l_inf",
+        OpT="raft::distance::detail::ops::l_inf_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="lp_unexpanded",
+        OpT="raft::distance::detail::ops::lp_unexp_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="russel_rao",
+        OpT="raft::distance::detail::ops::russel_rao_distance_op",
+        archs = [60],
+     ),
+]
+
+def arch_headers(op_instance):
+    """Build one dispatch_sm<arch>.cuh #include line per arch, newline-joined, no trailing newline."""
+    include_headers ="\n".join([
+        f"#include <raft/distance/detail/pairwise_matrix/dispatch_sm{arch}.cuh>"
+        for arch in op_instance["archs"]
+    ])
+    return include_headers
+
+
+for op in op_instances:  # emit one .cu file per (distance op, data type combination)
+    for dt in data_type_instances:
+        DataT, AccT, OutT, IdxT = (dt[k] for k in ["DataT", "AccT", "OutT", "IdxT"]);
+        path = f"dispatch_{op['path_prefix']}_{DataT}_{AccT}_{OutT}_{IdxT}.cu"  # no directory part
+        with open(path, "w") as f:  # relative path: created in the current working directory
+            f.write(header)
+            f.write(arch_headers(op))
+            f.write(macro)  # macro text begins with a newline, which ends the last #include line
+            # The final op is always raft::identity_op; only the op and types vary per file.
+            OpT = op['OpT']
+            FinOpT = "raft::identity_op"
+            f.write(f"\ninstantiate_raft_distance_detail_pairwise_matrix_dispatch({OpT}, {DataT}, {AccT}, {OutT}, {FinOpT}, {IdxT});\n")
+            f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
new file mode 100644
index 0000000000..e937e319f0
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Instantiate pairwise_matrix_dispatch for the canberra op: double data/acc/out, int index.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::canberra_distance_op,
+  double,
+  double,
+  double,
+  raft::identity_op,
+  int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
new file mode 100644
index 0000000000..87f6d3ba6b
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Instantiate pairwise_matrix_dispatch for the canberra op: float data/acc/out, int index.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::canberra_distance_op, float, float, float, raft::identity_op, int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu
new file mode 100644
index 0000000000..04d223edb3
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Instantiate pairwise_matrix_dispatch for the correlation op: double data/acc/out, int index.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::correlation_distance_op,
+  double,
+  double,
+  double,
+  raft::identity_op,
+  int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu
new file mode 100644
index 0000000000..a5f66d448c
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Instantiate pairwise_matrix_dispatch for the correlation op: float data/acc/out, int index.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::correlation_distance_op,
+  float,
+  float,
+  float,
+  raft::identity_op,
+  int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu
new file mode 100644
index 0000000000..afd9dfdffc
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm80.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Instantiate pairwise_matrix_dispatch for the cosine op: double data/acc/out, int index.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::cosine_distance_op, double, double, double, raft::identity_op, int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu
new file mode 100644
index 0000000000..cd720b5363
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm80.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Instantiate pairwise_matrix_dispatch for the cosine op: float data/acc/out, int index.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::cosine_distance_op, float, float, float, raft::identity_op, int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu
new file mode 100644
index 0000000000..8b1ca0d6d5
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Instantiate pairwise_matrix_dispatch for the hamming op: double data/acc/out, int index.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::hamming_distance_op, double, double, double, raft::identity_op, int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu
new file mode 100644
index 0000000000..4a4338ebd1
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Instantiate pairwise_matrix_dispatch for the hamming op: float data/acc/out, int index.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::hamming_distance_op, float, float, float, raft::identity_op, int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu
new file mode 100644
index 0000000000..e21a601ff6
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Instantiate pairwise_matrix_dispatch for the hellinger op: double data/acc/out, int index.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::hellinger_distance_op,
+  double,
+  double,
+  double,
+  raft::identity_op,
+  int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu
new file mode 100644
index 0000000000..e1c08abf46
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)  // expansion has no trailing ';' -- the use site supplies it
+// Instantiate: OpT=hellinger_distance_op, DataT=AccT=OutT=float, FinOpT=identity_op, IdxT=int.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::hellinger_distance_op, float, float, float, raft::identity_op, int);
+// Single-use helper: undefine it so the macro name does not outlive this instantiation.
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu
new file mode 100644
index 0000000000..c3a675eb0e
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)  // expansion has no trailing ';' -- the use site supplies it
+// Explicit instantiation of pairwise_matrix_dispatch for jensen_shannon_distance_op (double, int).
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::jensen_shannon_distance_op,  // OpT, applied as OpT<DataT, AccT, IdxT>
+  double,  // DataT
+  double,  // AccT
+  double,  // OutT
+  raft::identity_op,  // FinOpT
+  int);  // IdxT
+// Single-use helper: undefine it so the macro name does not outlive this instantiation.
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu
new file mode 100644
index 0000000000..6dcc056e2d
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)  // expansion has no trailing ';' -- the use site supplies it
+// Explicit instantiation of pairwise_matrix_dispatch for jensen_shannon_distance_op (float, int).
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::jensen_shannon_distance_op,  // OpT, applied as OpT<DataT, AccT, IdxT>
+  float,  // DataT
+  float,  // AccT
+  float,  // OutT
+  raft::identity_op,  // FinOpT
+  int);  // IdxT
+// Single-use helper: undefine it so the macro name does not outlive this instantiation.
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu
new file mode 100644
index 0000000000..4bb4ad1f11
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)  // expansion has no trailing ';' -- the use site supplies it
+// Instantiate: OpT=kl_divergence_op, DataT=AccT=OutT=double, FinOpT=identity_op, IdxT=int.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::kl_divergence_op, double, double, double, raft::identity_op, int);
+// Single-use helper: undefine it so the macro name does not outlive this instantiation.
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu
new file mode 100644
index 0000000000..44d6c6cace
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)  // expansion has no trailing ';' -- the use site supplies it
+// Instantiate: OpT=kl_divergence_op, DataT=AccT=OutT=float, FinOpT=identity_op, IdxT=int.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::kl_divergence_op, float, float, float, raft::identity_op, int);
+// Single-use helper: undefine it so the macro name does not outlive this instantiation.
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu
new file mode 100644
index 0000000000..3d257c5001
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)  // expansion has no trailing ';' -- the use site supplies it
+// Instantiate: OpT=l1_distance_op, DataT=AccT=OutT=double, FinOpT=identity_op, IdxT=int.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l1_distance_op, double, double, double, raft::identity_op, int);
+// Single-use helper: undefine it so the macro name does not outlive this instantiation.
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu
new file mode 100644
index 0000000000..1a0b393a0a
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)  // expansion has no trailing ';' -- the use site supplies it
+// Instantiate: OpT=l1_distance_op, DataT=AccT=OutT=float, FinOpT=identity_op, IdxT=int.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l1_distance_op, float, float, float, raft::identity_op, int);
+// Single-use helper: undefine it so the macro name does not outlive this instantiation.
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu
new file mode 100644
index 0000000000..4059844964
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm80.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)  // expansion has no trailing ';' -- the use site supplies it
+// Instantiate: OpT=l2_exp_distance_op, DataT=AccT=OutT=double, FinOpT=identity_op, IdxT=int.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l2_exp_distance_op, double, double, double, raft::identity_op, int);
+// Single-use helper: undefine it so the macro name does not outlive this instantiation.
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu
new file mode 100644
index 0000000000..474a29a149
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm80.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)  // expansion has no trailing ';' -- the use site supplies it
+// Instantiate: OpT=l2_exp_distance_op, DataT=AccT=OutT=float, FinOpT=identity_op, IdxT=int.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l2_exp_distance_op, float, float, float, raft::identity_op, int);
+// Single-use helper: undefine it so the macro name does not outlive this instantiation.
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu
new file mode 100644
index 0000000000..99624b59b9
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)  // expansion has no trailing ';' -- the use site supplies it
+// Explicit instantiation of pairwise_matrix_dispatch for l2_unexp_distance_op (double, int).
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l2_unexp_distance_op,  // OpT, applied as OpT<DataT, AccT, IdxT>
+  double,  // DataT
+  double,  // AccT
+  double,  // OutT
+  raft::identity_op,  // FinOpT
+  int);  // IdxT
+// Single-use helper: undefine it so the macro name does not outlive this instantiation.
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu
new file mode 100644
index 0000000000..5901e7c142
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)  // expansion has no trailing ';' -- the use site supplies it
+// Instantiate: OpT=l2_unexp_distance_op, DataT=AccT=OutT=float, FinOpT=identity_op, IdxT=int.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l2_unexp_distance_op, float, float, float, raft::identity_op, int);
+// Single-use helper: undefine it so the macro name does not outlive this instantiation.
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu
new file mode 100644
index 0000000000..22e1470bc3
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Emit the explicit instantiation for l_inf_distance_op (double data/acc/out, identity_op, int).
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l_inf_distance_op, double, double, double, raft::identity_op, int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu
new file mode 100644
index 0000000000..28a66bb36a
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Emit the explicit instantiation for l_inf_distance_op (float data/acc/out, identity_op, int).
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l_inf_distance_op, float, float, float, raft::identity_op, int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu
new file mode 100644
index 0000000000..3b36712161
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Emit the explicit instantiation for lp_unexp_distance_op (double data/acc/out, identity_op, int).
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::lp_unexp_distance_op,
+  double,
+  double,
+  double,
+  raft::identity_op,
+  int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu
new file mode 100644
index 0000000000..be5f30a4d6
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Emit the explicit instantiation for lp_unexp_distance_op (float data/acc/out, identity_op, int).
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::lp_unexp_distance_op, float, float, float, raft::identity_op, int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu
new file mode 100644
index 0000000000..7e66efae9e
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Emit the explicit instantiation for russel_rao_distance_op (double data/acc/out, identity_op, int).
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::russel_rao_distance_op,
+  double,
+  double,
+  double,
+  raft::identity_op,
+  int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
new file mode 100644
index 0000000000..bb3f493445
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+// Emit the explicit instantiation for russel_rao_distance_op (float data/acc/out, identity_op, int).
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::russel_rao_distance_op, float, float, float, raft::identity_op, int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu
new file mode 100644
index 0000000000..f986dd30ef
--- /dev/null
+++ b/cpp/src/distance/distance.cu
@@ -0,0 +1,909 @@
+/*
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/distance-inl.cuh>
+
+#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, FinalLambda, IdxT) \
+  template void raft::distance::distance<DT, DataT, AccT, OutT, FinalLambda, IdxT>(  \
+    raft::resources const& handle,                                                   \
+    const DataT* x,                                                                  \
+    const DataT* y,                                                                  \
+    OutT* dist,                                                                      \
+    IdxT m,                                                                          \
+    IdxT n,                                                                          \
+    IdxT k,                                                                          \
+    void* workspace,                                                                 \
+    size_t worksize,                                                                 \
+    FinalLambda fin_op,                                                              \
+    bool isRowMajor,                                                                 \
+    DataT metric_arg)
+// Instantiate the workspace + fin_op overload per listed metric: float and double, identity_op, int.
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CorrelationExpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::identity_op,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CosineExpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CosineExpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HammingUnexpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HammingUnexpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HellingerExpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HellingerExpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L1, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L1, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtExpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtExpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtUnexpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtUnexpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Linf, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Linf, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, double, double, double, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::RusselRaoExpanded, float, float, float, raft::identity_op, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::RusselRaoExpanded, double, double, double, raft::identity_op, int);
+
+#undef instantiate_raft_distance_distance
+
+// Same, but for the overload without a fin_op argument (workspace/worksize are still passed)
+#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, IdxT) \
+  template void raft::distance::distance<DT, DataT, AccT, OutT, IdxT>(  \
+    raft::resources const& handle,                                      \
+    const DataT* x,                                                     \
+    const DataT* y,                                                     \
+    OutT* dist,                                                         \
+    IdxT m,                                                             \
+    IdxT n,                                                             \
+    IdxT k,                                                             \
+    void* workspace,                                                    \
+    size_t worksize,                                                    \
+    bool isRowMajor,                                                    \
+    DataT metric_arg)
+
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CorrelationExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CorrelationExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CosineExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CosineExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HammingUnexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HammingUnexpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HellingerExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HellingerExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, double, double, double, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L1, float, float, float, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L1, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtUnexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtUnexpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, double, double, double, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::Linf, float, float, float, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::Linf, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::RusselRaoExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::RusselRaoExpanded, double, double, double, int);
+
+#undef instantiate_raft_distance_distance
+
+// Same, but for the overload that takes neither fin_op nor workspace/worksize arguments
+#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, IdxT) \
+  template void raft::distance::distance<DT, DataT, AccT, OutT, IdxT>(  \
+    raft::resources const& handle,                                      \
+    const DataT* x,                                                     \
+    const DataT* y,                                                     \
+    OutT* dist,                                                         \
+    IdxT m,                                                             \
+    IdxT n,                                                             \
+    IdxT k,                                                             \
+    bool isRowMajor,                                                    \
+    DataT metric_arg)
+
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CorrelationExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CorrelationExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CosineExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::CosineExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HammingUnexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HammingUnexpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HellingerExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::HellingerExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, double, double, double, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L1, float, float, float, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L1, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtExpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtUnexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2SqrtUnexpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, double, double, double, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::Linf, float, float, float, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::Linf, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, double, double, double, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::RusselRaoExpanded, float, float, float, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::RusselRaoExpanded, double, double, double, int);
+
+#undef instantiate_raft_distance_distance
+
+#define instantiate_raft_distance_getWorkspaceSize(DistT, DataT, AccT, OutT, IdxT)  \
+  template size_t raft::distance::getWorkspaceSize<DistT, DataT, AccT, OutT, IdxT>( \
+    const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k)  // raw-pointer overload
+
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Canberra, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Canberra, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::CorrelationExpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::CorrelationExpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::CosineExpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::CosineExpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::HammingUnexpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::HammingUnexpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::HellingerExpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::HellingerExpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::InnerProduct, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::InnerProduct, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::JensenShannon, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::JensenShannon, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::KLDivergence, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::KLDivergence, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L1, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L1, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2SqrtExpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2SqrtExpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2SqrtUnexpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2SqrtUnexpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Unexpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Linf, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Linf, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::LpUnexpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::LpUnexpanded, double, double, double, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::RusselRaoExpanded, float, float, float, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::RusselRaoExpanded, double, double, double, int);
+
+#undef instantiate_raft_distance_getWorkspaceSize  // name is reused by the next #define
+
+#define instantiate_raft_distance_getWorkspaceSize(DistT, DataT, AccT, OutT, IdxT, layout)  \
+  template size_t raft::distance::getWorkspaceSize<DistT, DataT, AccT, OutT, IdxT, layout>( \
+    raft::device_matrix_view<DataT, IdxT, layout> const& x,                                 \
+    raft::device_matrix_view<DataT, IdxT, layout> const& y)  // device_matrix_view overload
+
+// We could consider not taking template parameters here: the instantiation count seems excessive.
+// NOTE(review): the list ends at L2Unexpanded/float/layout_f_contiguous (no Linf etc.); intended?
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Canberra, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Canberra, double, double, double, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Canberra, float, float, float, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::Canberra, double, double, double, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CorrelationExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CorrelationExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CorrelationExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CorrelationExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::CosineExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HammingUnexpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HellingerExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HellingerExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HellingerExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::HellingerExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::InnerProduct, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::InnerProduct,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::InnerProduct, float, float, float, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::InnerProduct,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::JensenShannon, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::JensenShannon,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::JensenShannon, float, float, float, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::JensenShannon,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::KLDivergence, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::KLDivergence,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::KLDivergence, float, float, float, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::KLDivergence,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L1, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L1, double, double, double, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L1, float, float, float, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L1, double, double, double, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, double, double, double, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, float, float, float, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, double, double, double, int, raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtExpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtExpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                           float,
+                                           float,
+                                           float,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_f_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, int, raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(raft::distance::DistanceType::L2Unexpanded,
+                                           double,
+                                           double,
+                                           double,
+                                           int,
+                                           raft::layout_c_contiguous);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, int, raft::layout_f_contiguous);
+
+#undef instantiate_raft_distance_getWorkspaceSize
+
+#define instantiate_raft_distance_pairwise_distance(DataT, IdxT)                        \
+  template void raft::distance::pairwise_distance(raft::resources const& handle,        \
+                                                  const DataT* x,                       \
+                                                  const DataT* y,                       \
+                                                  DataT* dist,                          \
+                                                  IdxT m,                               \
+                                                  IdxT n,                               \
+                                                  IdxT k,                               \
+                                                  rmm::device_uvector<char>& workspace, \
+                                                  raft::distance::DistanceType metric,  \
+                                                  bool isRowMajor,                      \
+                                                  DataT metric_arg)  // overload with workspace
+
+instantiate_raft_distance_pairwise_distance(float, int);   // DataT = float, IdxT = int
+instantiate_raft_distance_pairwise_distance(double, int);  // DataT = double, IdxT = int
+
+#undef instantiate_raft_distance_pairwise_distance  // name is redefined just below
+
+// Same as above, but for the pairwise_distance overload that takes no workspace argument
+#define instantiate_raft_distance_pairwise_distance(DataT, IdxT)                       \
+  template void raft::distance::pairwise_distance(raft::resources const& handle,       \
+                                                  const DataT* x,                      \
+                                                  const DataT* y,                      \
+                                                  DataT* dist,                         \
+                                                  IdxT m,                              \
+                                                  IdxT n,                              \
+                                                  IdxT k,                              \
+                                                  raft::distance::DistanceType metric, \
+                                                  bool isRowMajor,                     \
+                                                  DataT metric_arg)
+
+instantiate_raft_distance_pairwise_distance(float, int);   // DataT = float, IdxT = int
+instantiate_raft_distance_pairwise_distance(double, int);  // DataT = double, IdxT = int
+
+#undef instantiate_raft_distance_pairwise_distance
+
+// Version with mdspan
+#define instantiate_raft_distance_distance(DistT, DataT, AccT, OutT, layout, IdxT) \
+  template void raft::distance::distance<DistT, DataT, AccT, OutT, layout, IdxT>(  \
+    raft::resources const& handle,                                                 \
+    raft::device_matrix_view<DataT, IdxT, layout> const x,                         \
+    raft::device_matrix_view<DataT, IdxT, layout> const y,                         \
+    raft::device_matrix_view<OutT, IdxT, layout> dist,                             \
+    DataT metric_arg)
+
+// Again, we might want to consider reining in the number of instantiations...
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, double, double, double, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Canberra, double, double, double, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CorrelationExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::CosineExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HammingUnexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HellingerExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HellingerExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HellingerExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::HellingerExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::InnerProduct,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::InnerProduct, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::InnerProduct,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::JensenShannon,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::JensenShannon, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::JensenShannon,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::KLDivergence,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::KLDivergence, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::KLDivergence,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L1, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L1, double, double, double, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L1, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L1, double, double, double, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, double, double, double, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Expanded, double, double, double, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2SqrtUnexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Unexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::L2Unexpanded, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Unexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Linf, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Linf, double, double, double, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Linf, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::Linf, double, double, double, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, float, float, float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::LpUnexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(
+  raft::distance::DistanceType::LpUnexpanded, float, float, float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::LpUnexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::RusselRaoExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::RusselRaoExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_c_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::RusselRaoExpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::layout_f_contiguous,
+                                   int);
+instantiate_raft_distance_distance(raft::distance::DistanceType::RusselRaoExpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::layout_f_contiguous,
+                                   int);
+
+#undef instantiate_raft_distance_distance
+
+#define instantiate_raft_distance_pairwise_distance(DataT, layout, IdxT) \
+  template void raft::distance::pairwise_distance(                       \
+    raft::resources const& handle,                                       \
+    raft::device_matrix_view<DataT, IdxT, layout> const x,               \
+    raft::device_matrix_view<DataT, IdxT, layout> const y,               \
+    raft::device_matrix_view<DataT, IdxT, layout> dist,                  \
+    raft::distance::DistanceType metric,                                 \
+    DataT metric_arg)
+
+instantiate_raft_distance_pairwise_distance(float, raft::layout_c_contiguous, int);
+instantiate_raft_distance_pairwise_distance(float, raft::layout_f_contiguous, int);
+instantiate_raft_distance_pairwise_distance(double, raft::layout_c_contiguous, int);
+instantiate_raft_distance_pairwise_distance(double, raft::layout_f_contiguous, int);
+
+#undef instantiate_raft_distance_pairwise_distance
diff --git a/cpp/src/distance/fused_l2_min_arg.cu b/cpp/src/distance/fused_l2_min_arg.cu
index b682446cc2..487c7e3a4a 100644
--- a/cpp/src/distance/fused_l2_min_arg.cu
+++ b/cpp/src/distance/fused_l2_min_arg.cu
@@ -20,6 +20,7 @@
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/fused_l2_nn.cuh>
 #include <raft/distance/specializations.cuh>
+#include <raft/linalg/norm.cuh>
 #include <thrust/for_each.h>
 #include <thrust/tuple.h>
 
@@ -95,4 +96,4 @@ void fused_l2_nn_min_arg(raft::device_resources const& handle,
   compute_fused_l2_nn_min_arg<double, int>(handle, min, x, y, m, n, k, sqrt);
 }
 
-}  // end namespace raft::runtime::distance
\ No newline at end of file
+}  // end namespace raft::runtime::distance

From 50a8fe7441b2d02553b921eb90a7063e4694541c Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 11 Apr 2023 11:21:08 +0200
Subject: [PATCH 14/89] Clean up CMake

---
 cpp/CMakeLists.txt | 44 --------------------------------------------
 1 file changed, 44 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c4051984cc..bebc50f2c1 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -332,21 +332,6 @@ if(RAFT_COMPILE_LIBRARY)
     src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu
     src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
-    # src/distance/detail/pairwise_matrix/canberra_float_float_float_int.cu
-    # src/distance/specializations/detail/canberra_double_double_double_int.cu
-    # src/distance/specializations/detail/canberra_float_float_float_int.cu
-    # src/distance/specializations/detail/correlation_double_double_double_int.cu
-    # src/distance/specializations/detail/correlation_float_float_float_int.cu
-    # src/distance/specializations/detail/cosine_double_double_double_int.cu
-    # src/distance/specializations/detail/cosine_float_float_float_int.cu
-    # src/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu
-    # src/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu
-    # src/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu
-    # src/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu
-    # src/distance/specializations/detail/inner_product_float_float_float_int.cu
-    # src/distance/specializations/detail/inner_product_double_double_double_int.cu
-    # src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu
-    # src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu
     src/distance/specializations/detail/kernels/gram_matrix_base_double.cu
     src/distance/specializations/detail/kernels/gram_matrix_base_float.cu
     src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
@@ -357,20 +342,6 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/brute_force_knn_int64_t_float.cu
     src/distance/specializations/detail/kernels/tanh_kernel_double.cu
     src/distance/specializations/detail/kernels/tanh_kernel_float.cu
-    # src/distance/specializations/detail/kl_divergence_float_float_float_int.cu
-    # src/distance/specializations/detail/kl_divergence_double_double_double_int.cu
-    # src/distance/specializations/detail/l1_float_float_float_int.cu
-    # src/distance/specializations/detail/l1_double_double_double_int.cu
-    # src/distance/specializations/detail/l2_expanded_float_float_float_int.cu
-    # src/distance/specializations/detail/l2_expanded_double_double_double_int.cu
-    # src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu
-    # src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu
-    # src/distance/specializations/detail/l_inf_double_double_double_int.cu
-    # src/distance/specializations/detail/l_inf_float_float_float_int.cu
-    # src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu
-    # src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu
-    # src/distance/specializations/detail/russel_rao_double_double_double_int.cu
-    # src/distance/specializations/detail/russel_rao_float_float_float_int.cu
     src/distance/specializations/fused_l2_nn_double_int.cu
     src/distance/specializations/fused_l2_nn_double_int64.cu
     src/distance/specializations/fused_l2_nn_float_int.cu
@@ -379,10 +350,6 @@ if(RAFT_COMPILE_LIBRARY)
     src/matrix/detail/select_k_float_uint64_t.cu
     src/matrix/detail/select_k_half_uint32_t.cu
     src/matrix/detail/select_k_half_uint64_t.cu
-    # src/matrix/specializations/detail/select_k_float_uint32_t.cu
-    # src/matrix/specializations/detail/select_k_float_int64_t.cu
-    # src/matrix/specializations/detail/select_k_half_uint32_t.cu
-    # src/matrix/specializations/detail/select_k_half_int64_t.cu
     src/neighbors/ivfpq_build.cu
     src/neighbors/ivfpq_deserialize.cu
     src/neighbors/ivfpq_serialize.cu
@@ -431,17 +398,9 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu
     src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu
     src/neighbors/detail/ivf_flat_search.cu
-    # src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
-    # src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
-    # src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
-    # src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_3d.cu
     src/neighbors/specializations/ball_cover_all_knn_query.cu
     src/neighbors/specializations/ball_cover_build_index.cu
     src/neighbors/specializations/ball_cover_knn_query.cu
-    # src/neighbors/specializations/fused_l2_knn_long_float_true.cu
-    # src/neighbors/specializations/fused_l2_knn_long_float_false.cu
-    # src/neighbors/specializations/fused_l2_knn_int_float_true.cu
-    # src/neighbors/specializations/fused_l2_knn_int_float_false.cu
     src/neighbors/ivf_flat_search.cu
     src/neighbors/ivf_flat_build.cu
     src/neighbors/specializations/ivfflat_build_float_int64_t.cu
@@ -450,9 +409,6 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/specializations/ivfflat_extend_float_int64_t.cu
     src/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu
     src/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu
-    # src/neighbors/specializations/ivfflat_search_float_int64_t.cu
-    # src/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu
-    # src/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu
     src/neighbors/ivfpq_build.cu
     src/neighbors/ivfpq_deserialize.cu
     src/neighbors/ivfpq_serialize.cu

From 97fa949503ef5d7b5083efee4520de88faad22fe Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 11 Apr 2023 11:41:34 +0200
Subject: [PATCH 15/89] Revert "Comment out omp to enable clang compilation"

This reverts commit 22bde6f55ca020250b06a3dfca847b5563cde5af.
---
 cpp/include/raft/neighbors/detail/refine.cuh | 77 ++++++++++----------
 1 file changed, 39 insertions(+), 38 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/refine.cuh b/cpp/include/raft/neighbors/detail/refine.cuh
index 20f86d9bae..5fa0e1ab15 100644
--- a/cpp/include/raft/neighbors/detail/refine.cuh
+++ b/cpp/include/raft/neighbors/detail/refine.cuh
@@ -27,6 +27,7 @@
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 
 #include <cstdlib>
+#include <omp.h>
 
 #include <thrust/sequence.h>
 
@@ -202,44 +203,44 @@ void refine_host(raft::host_matrix_view<const data_t, matrix_idx, row_major> dat
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "neighbors::refine_host(%zu, %u)", size_t(numQueries), uint32_t(topK));
 
-  // #pragma omp parallel
-  //   {
-  //     struct_for_refinement* sfr =
-  //       (struct_for_refinement*)malloc(sizeof(struct_for_refinement) * topK);
-  //     for (size_t i = omp_get_thread_num(); i < numQueries; i += omp_get_num_threads()) {
-  //       // compute distance with original dataset vectors
-  //       const data_t* cur_query = queries_ptr + ((uint64_t)dimDataset * i);
-  //       for (size_t j = 0; j < (size_t)topK; j++) {
-  //         idx_t id                  = neighbors[j + (topK * i)];
-  //         const data_t* cur_dataset = dataset_ptr + ((uint64_t)dimDataset * id);
-  //         float distance            = 0.0;
-  //         for (size_t k = 0; k < (size_t)dimDataset; k++) {
-  //           float val_q = (float)(cur_query[k]);
-  //           float val_d = (float)(cur_dataset[k]);
-  //           if (metric == raft::distance::DistanceType::InnerProduct) {
-  //             distance += -val_q * val_d;  // Negate because we sort in ascending order.
-  //           } else {
-  //             distance += (val_q - val_d) * (val_q - val_d);
-  //           }
-  //         }
-  //         sfr[j].id       = id;
-  //         sfr[j].distance = distance;
-  //       }
-
-  //       qsort(sfr, topK, sizeof(struct_for_refinement), _postprocessing_qsort_compare);
-
-  //       for (size_t j = 0; j < (size_t)refinedTopK; j++) {
-  //         refinedNeighbors[j + (refinedTopK * i)] = sfr[j].id;
-  //         if (refinedDistances == NULL) continue;
-  //         if (metric == raft::distance::DistanceType::InnerProduct) {
-  //           refinedDistances[j + (refinedTopK * i)] = -sfr[j].distance;
-  //         } else {
-  //           refinedDistances[j + (refinedTopK * i)] = sfr[j].distance;
-  //         }
-  //       }
-  //     }
-  //     free(sfr);
-  //   }
+#pragma omp parallel
+  {
+    struct_for_refinement* sfr =
+      (struct_for_refinement*)malloc(sizeof(struct_for_refinement) * topK);
+    for (size_t i = omp_get_thread_num(); i < numQueries; i += omp_get_num_threads()) {
+      // compute distance with original dataset vectors
+      const data_t* cur_query = queries_ptr + ((uint64_t)dimDataset * i);
+      for (size_t j = 0; j < (size_t)topK; j++) {
+        idx_t id                  = neighbors[j + (topK * i)];
+        const data_t* cur_dataset = dataset_ptr + ((uint64_t)dimDataset * id);
+        float distance            = 0.0;
+        for (size_t k = 0; k < (size_t)dimDataset; k++) {
+          float val_q = (float)(cur_query[k]);
+          float val_d = (float)(cur_dataset[k]);
+          if (metric == raft::distance::DistanceType::InnerProduct) {
+            distance += -val_q * val_d;  // Negate because we sort in ascending order.
+          } else {
+            distance += (val_q - val_d) * (val_q - val_d);
+          }
+        }
+        sfr[j].id       = id;
+        sfr[j].distance = distance;
+      }
+
+      qsort(sfr, topK, sizeof(struct_for_refinement), _postprocessing_qsort_compare);
+
+      for (size_t j = 0; j < (size_t)refinedTopK; j++) {
+        refinedNeighbors[j + (refinedTopK * i)] = sfr[j].id;
+        if (refinedDistances == NULL) continue;
+        if (metric == raft::distance::DistanceType::InnerProduct) {
+          refinedDistances[j + (refinedTopK * i)] = -sfr[j].distance;
+        } else {
+          refinedDistances[j + (refinedTopK * i)] = sfr[j].distance;
+        }
+      }
+    }
+    free(sfr);
+  }
 }
 
 }  // namespace raft::neighbors::detail

From ca1ed721fb4be924a534c4b91a269c22dee824dd Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Wed, 12 Apr 2023 11:17:30 +0200
Subject: [PATCH 16/89] Fix tests

---
 cpp/CMakeLists.txt                            | 22 ++++-
 cpp/include/raft/core/logger-inl.hpp          | 25 ++---
 cpp/include/raft/core/logger.hpp              |  2 +-
 cpp/include/raft/distance/distance-ext.cuh    | 12 +--
 .../raft/matrix/detail/select_k-ext.cuh       |  4 +
 .../neighbors/detail/selection_faiss-ext.cuh  |  5 +
 .../neighbors/specializations/ball_cover.cuh  |  3 +-
 .../detail/ball_cover_lowdim.hpp              | 85 -----------------
 .../knn/detail/ball_cover/registers.cuh       |  8 +-
 .../spatial/knn/detail/fused_l2_knn-ext.cuh   | 12 +--
 .../raft/spatial/knn/detail/fused_l2_knn.cuh  |  8 +-
 cpp/include/raft/util/memory_pool-inl.hpp     |  3 +-
 cpp/include/raft/util/raft_explicit.hpp       |  1 +
 .../pairwise_matrix/dispatch_00_generate.py   |  6 +-
 ...uint64_t.cu => select_k_double_int64_t.cu} |  2 +-
 .../matrix/detail/select_k_double_uint32_t.cu | 34 +++++++
 .../matrix/detail/select_k_float_int64_t.cu   | 33 +++++++
 ...f_uint64_t.cu => select_k_half_int64_t.cu} |  0
 cpp/src/neighbors/detail/selection_faiss.cu   |  8 ++
 .../detail/ball_cover_lowdim_pass_one_2d.cu   | 43 ---------
 .../detail/ball_cover_lowdim_pass_one_3d.cu   | 43 ---------
 .../detail/ball_cover_lowdim_pass_two_2d.cu   | 41 ---------
 .../detail/ball_cover_lowdim_pass_two_3d.cu   | 42 ---------
 .../ball_cover/registers_00_generate.py       | 91 +++++++++++++++++++
 .../ball_cover/registers_pass_one_2d_dist.cu  | 39 ++++++++
 .../registers_pass_one_2d_euclidean.cu        | 39 ++++++++
 .../registers_pass_one_2d_haversine.cu        | 39 ++++++++
 .../ball_cover/registers_pass_one_3d_dist.cu  | 39 ++++++++
 .../registers_pass_one_3d_euclidean.cu        | 39 ++++++++
 .../registers_pass_one_3d_haversine.cu        | 39 ++++++++
 .../ball_cover/registers_pass_two_2d_dist.cu  | 39 ++++++++
 .../registers_pass_two_2d_euclidean.cu        | 39 ++++++++
 .../registers_pass_two_2d_haversine.cu        | 39 ++++++++
 .../ball_cover/registers_pass_two_3d_dist.cu  | 39 ++++++++
 .../registers_pass_two_3d_euclidean.cu        | 39 ++++++++
 .../registers_pass_two_3d_haversine.cu        | 39 ++++++++
 cpp/src/spatial/knn/detail/fused_l2_knn.cu    | 46 ++++++++++
 .../knn/detail/fused_l2_knn_int32_t_float.cu  | 40 ++++++++
 .../knn/detail/fused_l2_knn_int64_t_float.cu  | 40 ++++++++
 .../knn/detail/fused_l2_knn_uint32_t_float.cu | 41 +++++++++
 cpp/test/CMakeLists.txt                       | 21 +++++
 cpp/test/core/handle.cpp                      |  1 +
 cpp/test/distance/dist_adj.cu                 | 16 +---
 cpp/test/distance/dist_adj.cuh                | 71 +++++++++++++++
 .../distance/dist_adj_distance_instance.cu    | 63 +++++++++++++
 cpp/test/distance/dist_adj_threshold.cuh      | 34 +++++++
 cpp/test/linalg/eigen_solvers.cu              |  4 +-
 cpp/test/matrix/select_k.cu                   | 10 +-
 cpp/test/neighbors/ann_utils.cuh              |  1 +
 cpp/test/neighbors/fused_l2_knn.cu            |  6 +-
 cpp/test/neighbors/selection.cu               |  9 +-
 51 files changed, 1083 insertions(+), 321 deletions(-)
 delete mode 100644 cpp/include/raft/neighbors/specializations/detail/ball_cover_lowdim.hpp
 rename cpp/src/matrix/detail/{select_k_float_uint64_t.cu => select_k_double_int64_t.cu} (96%)
 create mode 100644 cpp/src/matrix/detail/select_k_double_uint32_t.cu
 create mode 100644 cpp/src/matrix/detail/select_k_float_int64_t.cu
 rename cpp/src/matrix/detail/{select_k_half_uint64_t.cu => select_k_half_int64_t.cu} (100%)
 delete mode 100644 cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_3d.cu
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_00_generate.py
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu
 create mode 100644 cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu
 create mode 100644 cpp/src/spatial/knn/detail/fused_l2_knn.cu
 create mode 100644 cpp/src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
 create mode 100644 cpp/src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
 create mode 100644 cpp/src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
 create mode 100644 cpp/test/distance/dist_adj.cuh
 create mode 100644 cpp/test/distance/dist_adj_distance_instance.cu
 create mode 100644 cpp/test/distance/dist_adj_threshold.cuh

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index bebc50f2c1..fc7014fa0c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -346,10 +346,12 @@ if(RAFT_COMPILE_LIBRARY)
     src/distance/specializations/fused_l2_nn_double_int64.cu
     src/distance/specializations/fused_l2_nn_float_int.cu
     src/distance/specializations/fused_l2_nn_float_int64.cu
+    src/matrix/detail/select_k_double_int64_t.cu
+    src/matrix/detail/select_k_double_uint32_t.cu
+    src/matrix/detail/select_k_float_int64_t.cu
     src/matrix/detail/select_k_float_uint32_t.cu
-    src/matrix/detail/select_k_float_uint64_t.cu
+    src/matrix/detail/select_k_half_int64_t.cu
     src/matrix/detail/select_k_half_uint32_t.cu
-    src/matrix/detail/select_k_half_uint64_t.cu
     src/neighbors/ivfpq_build.cu
     src/neighbors/ivfpq_deserialize.cu
     src/neighbors/ivfpq_serialize.cu
@@ -449,7 +451,21 @@ if(RAFT_COMPILE_LIBRARY)
     src/random/rmat_rectangular_generator_int64_double.cu
     src/random/rmat_rectangular_generator_int_float.cu
     src/random/rmat_rectangular_generator_int64_float.cu
-    src/spatial/knn/detail/ball_cover/registers.cu
+    src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
+    src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
+    src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu
     src/util/memory_pool.cpp
   )
   set_target_properties(
diff --git a/cpp/include/raft/core/logger-inl.hpp b/cpp/include/raft/core/logger-inl.hpp
index b63b98ae90..ee007eb2b8 100644
--- a/cpp/include/raft/core/logger-inl.hpp
+++ b/cpp/include/raft/core/logger-inl.hpp
@@ -36,6 +36,7 @@
 #define SPDLOG_HEADER_ONLY
 #include <raft/core/detail/callback_sink.hpp>
 #include <raft/util/cudart_utils.hpp>
+#include <raft/util/inline.hpp>               // RAFT_INLINE_CONDITIONAL
 #include <spdlog/sinks/stdout_color_sinks.h>  // NOLINT
 #include <spdlog/spdlog.h>                    // NOLINT
 
@@ -111,7 +112,7 @@ class logger::impl {  // defined privately here
  * @todo This currently only supports logging to stdout. Need to add support in
  *       future to add custom loggers as well [Issue #2046]
  */
-logger::logger(std::string const& name_) : pimpl(new impl(name_))
+RAFT_INLINE_CONDITIONAL logger::logger(std::string const& name_) : pimpl(new impl(name_))
 {
   set_pattern(default_log_pattern);
   set_level(RAFT_ACTIVE_LEVEL);
@@ -121,7 +122,7 @@ logger::logger(std::string const& name_) : pimpl(new impl(name_))
  *
  * @return the singleton logger object
  */
-logger& logger::get(std::string const& name)
+RAFT_INLINE_CONDITIONAL logger& logger::get(std::string const& name)
 {
   if (log_map.find(name) == log_map.end()) { log_map[name] = std::make_shared<raft::logger>(name); }
   return *log_map[name];
@@ -138,7 +139,7 @@ logger& logger::get(std::string const& name)
  *       range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll
  *       be ignored. See documentation of decisiontree for how this gets used
  */
-void logger::set_level(int level)
+RAFT_INLINE_CONDITIONAL void logger::set_level(int level)
 {
   level = raft::detail::convert_level_to_spdlog(level);
   pimpl->spdlogger->set_level(static_cast<spdlog::level::level_enum>(level));
@@ -151,7 +152,7 @@ void logger::set_level(int level)
  *                    https://github.com/gabime/spdlog/wiki/3.-Custom-formatting
  *                    to know the right syntax of this pattern
  */
-void logger::set_pattern(const std::string& pattern)
+RAFT_INLINE_CONDITIONAL void logger::set_pattern(const std::string& pattern)
 {
   pimpl->cur_pattern = pattern;
   pimpl->spdlogger->set_pattern(pattern);
@@ -162,7 +163,7 @@ void logger::set_pattern(const std::string& pattern)
  *
  * @param[in] callback the function to be run on all logged messages
  */
-void logger::set_callback(void (*callback)(int lvl, const char* msg))
+RAFT_INLINE_CONDITIONAL void logger::set_callback(void (*callback)(int lvl, const char* msg))
 {
   pimpl->sink->set_callback(callback);
 }
@@ -172,7 +173,7 @@ void logger::set_callback(void (*callback)(int lvl, const char* msg))
  *
  * @param[in] flush the function to use when flushing logs
  */
-void logger::set_flush(void (*flush)()) { pimpl->sink->set_flush(flush); }
+RAFT_INLINE_CONDITIONAL void logger::set_flush(void (*flush)()) { pimpl->sink->set_flush(flush); }
 
 /**
  * @brief Tells whether messages will be logged for the given log level
@@ -180,7 +181,7 @@ void logger::set_flush(void (*flush)()) { pimpl->sink->set_flush(flush); }
  * @param[in] level log level to be checked for
  * @return true if messages will be logged for this level, else false
  */
-bool logger::should_log_for(int level) const
+RAFT_INLINE_CONDITIONAL bool logger::should_log_for(int level) const
 {
   level        = raft::detail::convert_level_to_spdlog(level);
   auto level_e = static_cast<spdlog::level::level_enum>(level);
@@ -192,7 +193,7 @@ bool logger::should_log_for(int level) const
  *
  * @return the current log level
  */
-int logger::get_level() const
+RAFT_INLINE_CONDITIONAL int logger::get_level() const
 {
   auto level_e = pimpl->spdlogger->level();
   return RAFT_LEVEL_TRACE - static_cast<int>(level_e);
@@ -202,7 +203,7 @@ int logger::get_level() const
  * @brief Get the current logging pattern
  * @return the pattern
  */
-std::string logger::get_pattern() const { return pimpl->cur_pattern; }
+RAFT_INLINE_CONDITIONAL std::string logger::get_pattern() const { return pimpl->cur_pattern; }
 
 /**
  * @brief Main logging method
@@ -210,7 +211,7 @@ std::string logger::get_pattern() const { return pimpl->cur_pattern; }
  * @param[in] level logging level of this message
  * @param[in] fmt   C-like format string, followed by respective params
  */
-void logger::log(int level, const char* fmt, ...)
+RAFT_INLINE_CONDITIONAL void logger::log(int level, const char* fmt, ...)
 {
   level        = raft::detail::convert_level_to_spdlog(level);
   auto level_e = static_cast<spdlog::level::level_enum>(level);
@@ -227,8 +228,8 @@ void logger::log(int level, const char* fmt, ...)
 /**
  * @brief Flush logs by calling flush on underlying logger
  */
-void logger::flush() { pimpl->spdlogger->flush(); }
+RAFT_INLINE_CONDITIONAL void logger::flush() { pimpl->spdlogger->flush(); }
 
-logger::~logger() {}
+RAFT_INLINE_CONDITIONAL logger::~logger() {}
 
 };  // namespace raft
diff --git a/cpp/include/raft/core/logger.hpp b/cpp/include/raft/core/logger.hpp
index 84e44ab7e8..109cf8b334 100644
--- a/cpp/include/raft/core/logger.hpp
+++ b/cpp/include/raft/core/logger.hpp
@@ -17,7 +17,7 @@
 
 #include "logger-macros.hpp"
 
-#if defined(RAFT_COMPILED) && defined(RAFT_EXPLICIT_INSTANTIATE)
+#ifdef RAFT_COMPILED
 #include "logger-ext.hpp"
 #else
 #include "logger-inl.hpp"
diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh
index 20af73e401..4df42ca72e 100644
--- a/cpp/include/raft/distance/distance-ext.cuh
+++ b/cpp/include/raft/distance/distance-ext.cuh
@@ -15,12 +15,12 @@
  */
 #pragma once
 
-#include <raft/core/device_mdspan.hpp>       // raft::device_matrix_view
-#include <raft/core/operators.hpp>           // raft::identity_op
-#include <raft/core/resources.hpp>           // raft::resources
-#include <raft/distance/distance_types.hpp>  // raft::distance::DistanceType
-#include <raft/util/raft_explicit.hpp>       // RAFT_EXPLICIT
-#include <rmm/device_uvector.hpp>            // rmm::device_uvector
+#include <raft/core/device_mdspan.hpp>                  // raft::device_matrix_view
+#include <raft/core/operators.hpp>                      // raft::identity_op
+#include <raft/core/resources.hpp>                      // raft::resources
+#include <raft/distance/distance_types.hpp>             // raft::distance::DistanceType
+#include <raft/util/raft_explicit.hpp>                  // RAFT_EXPLICIT
+#include <rmm/device_uvector.hpp>                       // rmm::device_uvector
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE
 
diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh
index 074ac3127a..2f2912219f 100644
--- a/cpp/include/raft/matrix/detail/select_k-ext.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh
@@ -94,5 +94,9 @@ instantiate_raft_matrix_detail_select_k(__half, uint32_t);
 instantiate_raft_matrix_detail_select_k(__half, int64_t);
 instantiate_raft_matrix_detail_select_k(float, int64_t);
 instantiate_raft_matrix_detail_select_k(float, uint32_t);
+// The two double instantiations below were previously missing, but the tests
+// use them, so they are included here.
+instantiate_raft_matrix_detail_select_k(double, int64_t);
+instantiate_raft_matrix_detail_select_k(double, uint32_t);
 
 #undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
index cd6fdee192..c724aa0407 100644
--- a/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
@@ -65,6 +65,11 @@ void select_k(const key_t* inK,
 // @benfred: Not sure if this is correct. Should I not flip float and uint32_t?
 // It seems weird that float is the key and uint32_t is the payload type.
 instantiate_raft_neighbors_detail_select_k(uint32_t, float);
+instantiate_raft_neighbors_detail_select_k(int32_t, float);
 instantiate_raft_neighbors_detail_select_k(long, float);
+instantiate_raft_neighbors_detail_select_k(size_t, double);
+// Needed by test/neighbors/selection.cu:
+instantiate_raft_neighbors_detail_select_k(int, double);
+instantiate_raft_neighbors_detail_select_k(size_t, float);
 
 #undef instantiate_raft_neighbors_detail_select_k
diff --git a/cpp/include/raft/neighbors/specializations/ball_cover.cuh b/cpp/include/raft/neighbors/specializations/ball_cover.cuh
index d6a6b2e296..33e1a272e3 100644
--- a/cpp/include/raft/neighbors/specializations/ball_cover.cuh
+++ b/cpp/include/raft/neighbors/specializations/ball_cover.cuh
@@ -18,7 +18,6 @@
 
 #include <raft/neighbors/ball_cover.cuh>
 #include <raft/neighbors/ball_cover_types.hpp>
-#include <raft/neighbors/specializations/detail/ball_cover_lowdim.hpp>
 
 #include <cstdint>
 
@@ -50,4 +49,4 @@ extern template void all_knn_query<std::int64_t, float, std::uint32_t, std::uint
   bool perform_post_filtering,
   float weight);
 
-};  // namespace raft::neighbors::ball_cover
\ No newline at end of file
+};  // namespace raft::neighbors::ball_cover
diff --git a/cpp/include/raft/neighbors/specializations/detail/ball_cover_lowdim.hpp b/cpp/include/raft/neighbors/specializations/detail/ball_cover_lowdim.hpp
deleted file mode 100644
index c558ab8b56..0000000000
--- a/cpp/include/raft/neighbors/specializations/detail/ball_cover_lowdim.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/spatial/knn/detail/ball_cover/common.cuh>
-#include <raft/spatial/knn/detail/ball_cover/registers.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
-
-extern template void rbc_low_dim_pass_one<std::int64_t, float, std::uint32_t, 2>(
-  raft::device_resources const& handle,
-  const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
-  const float* query,
-  const std::uint32_t n_query_rows,
-  std::uint32_t k,
-  const std::int64_t* R_knn_inds,
-  const float* R_knn_dists,
-  DistFunc<float, std::uint32_t>& dfunc,
-  std::int64_t* inds,
-  float* dists,
-  float weight,
-  std::uint32_t* dists_counter);
-
-extern template void rbc_low_dim_pass_two<std::int64_t, float, std::uint32_t, 2>(
-  raft::device_resources const& handle,
-  const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
-  const float* query,
-  const std::uint32_t n_query_rows,
-  std::uint32_t k,
-  const std::int64_t* R_knn_inds,
-  const float* R_knn_dists,
-  DistFunc<float, std::uint32_t>& dfunc,
-  std::int64_t* inds,
-  float* dists,
-  float weight,
-  std::uint32_t* post_dists_counter);
-
-extern template void rbc_low_dim_pass_one<std::int64_t, float, std::uint32_t, 3>(
-  raft::device_resources const& handle,
-  const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
-  const float* query,
-  const std::uint32_t n_query_rows,
-  std::uint32_t k,
-  const std::int64_t* R_knn_inds,
-  const float* R_knn_dists,
-  DistFunc<float, std::uint32_t>& dfunc,
-  std::int64_t* inds,
-  float* dists,
-  float weight,
-  std::uint32_t* dists_counter);
-
-extern template void rbc_low_dim_pass_two<std::int64_t, float, std::uint32_t, 3>(
-  raft::device_resources const& handle,
-  const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
-  const float* query,
-  const std::uint32_t n_query_rows,
-  std::uint32_t k,
-  const std::int64_t* R_knn_inds,
-  const float* R_knn_dists,
-  DistFunc<float, std::uint32_t>& dfunc,
-  std::int64_t* inds,
-  float* dists,
-  float weight,
-  std::uint32_t* post_dists_counter);
-
-};  // namespace detail
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
index 399d4b07c6..b5925680f2 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
@@ -16,8 +16,10 @@
 
 #pragma once
 
-#if defined(RAFT_COMPILED) && defined(RAFT_EXPLICIT_INSTANTIATE)
-#include "registers-ext.cuh"
-#else
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
 #include "registers-inl.cuh"
 #endif
+
+#ifdef RAFT_COMPILED
+#include "registers-ext.cuh"
+#endif
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
index 5f4d5a6347..07ddf3a166 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
@@ -72,13 +72,13 @@ void fusedL2Knn(size_t D,
     cudaStream_t stream,                                                                    \
     raft::distance::DistanceType metric)
 
-instantiate_raft_spatial_knn_detail_fusedL2Knn(long, float, true);
-instantiate_raft_spatial_knn_detail_fusedL2Knn(long, float, false);
-instantiate_raft_spatial_knn_detail_fusedL2Knn(int, float, true);
-instantiate_raft_spatial_knn_detail_fusedL2Knn(int, float, false);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(int32_t, float, true);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(int32_t, float, false);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(int64_t, float, true);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(int64_t, float, false);
 
 // These are used by brute_force_knn:
-instantiate_raft_spatial_knn_detail_fusedL2Knn(unsigned int, float, true);
-instantiate_raft_spatial_knn_detail_fusedL2Knn(unsigned int, float, false);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(uint32_t, float, true);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(uint32_t, float, false);
 
 #undef instantiate_raft_spatial_knn_detail_fusedL2Knn
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
index 7b16fc6f72..f9b9138168 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
@@ -15,10 +15,10 @@
  */
 #pragma once
 
-#if defined(RAFT_COMPILED)
-#include "fused_l2_knn-ext.cuh"
-#endif
-
 #if !defined(RAFT_EXPLICIT_INSTANTIATE)
 #include "fused_l2_knn-inl.cuh"
 #endif
+
+#if defined(RAFT_COMPILED)
+#include "fused_l2_knn-ext.cuh"
+#endif
diff --git a/cpp/include/raft/util/memory_pool-inl.hpp b/cpp/include/raft/util/memory_pool-inl.hpp
index 43ac24d3f6..6d457923b9 100644
--- a/cpp/include/raft/util/memory_pool-inl.hpp
+++ b/cpp/include/raft/util/memory_pool-inl.hpp
@@ -18,13 +18,14 @@
 #include <cstddef>
 #include <memory>
 
+#include <raft/util/inline.hpp>  // RAFT_INLINE_CONDITIONAL
 #include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
 namespace raft {
 
-std::unique_ptr<rmm::mr::device_memory_resource> get_pool_memory_resource(
+RAFT_INLINE_CONDITIONAL std::unique_ptr<rmm::mr::device_memory_resource> get_pool_memory_resource(
   rmm::mr::device_memory_resource*& mr, size_t initial_size)
 {
   using pool_res_t = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
diff --git a/cpp/include/raft/util/raft_explicit.hpp b/cpp/include/raft/util/raft_explicit.hpp
index fd81fe23de..ceb82fadf7 100644
--- a/cpp/include/raft/util/raft_explicit.hpp
+++ b/cpp/include/raft/util/raft_explicit.hpp
@@ -17,6 +17,7 @@
 #define RAFT_EXPLICIT                                                     \
   {                                                                       \
     raft::util::raft_explicit::do_not_implicitly_instantiate_templates(); \
+    throw "raft_explicit_error";                                          \
   }
 
 namespace raft::util::raft_explicit {
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
index 8978697ead..42ce3e2291 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
@@ -127,10 +127,10 @@
      ),
 ]
 
-def arch_headers(op_instance):
+def arch_headers(archs):
     include_headers ="\n".join([
         f"#include <raft/distance/detail/pairwise_matrix/dispatch_sm{arch}.cuh>"
-        for arch in op_instance["archs"]
+        for arch in archs
     ])
     return include_headers
 
@@ -142,7 +142,7 @@ def arch_headers(op_instance):
         path = f"dispatch_{op['path_prefix']}_{DataT}_{AccT}_{OutT}_{IdxT}.cu"
         with open(path, "w") as f:
             f.write(header)
-            f.write(arch_headers(op))
+            f.write(arch_headers(op["archs"]))
             f.write(macro)
 
             OpT = op['OpT']
diff --git a/cpp/src/matrix/detail/select_k_float_uint64_t.cu b/cpp/src/matrix/detail/select_k_double_int64_t.cu
similarity index 96%
rename from cpp/src/matrix/detail/select_k_float_uint64_t.cu
rename to cpp/src/matrix/detail/select_k_double_int64_t.cu
index 3bb47acbf2..022627283a 100644
--- a/cpp/src/matrix/detail/select_k_float_uint64_t.cu
+++ b/cpp/src/matrix/detail/select_k_double_int64_t.cu
@@ -28,6 +28,6 @@
                                                rmm::cuda_stream_view stream, \
                                                rmm::mr::device_memory_resource* mr)
 
-instantiate_raft_matrix_detail_select_k(float, uint32_t);
+instantiate_raft_matrix_detail_select_k(double, int64_t);
 
 #undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_double_uint32_t.cu b/cpp/src/matrix/detail/select_k_double_uint32_t.cu
new file mode 100644
index 0000000000..22c6989337
--- /dev/null
+++ b/cpp/src/matrix/detail/select_k_double_uint32_t.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // uint32_t
+#include <raft/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                     \
+  template void raft::matrix::detail::select_k(const T* in_val,              \
+                                               const IdxT* in_idx,           \
+                                               size_t batch_size,            \
+                                               size_t len,                   \
+                                               int k,                        \
+                                               T* out_val,                   \
+                                               IdxT* out_idx,                \
+                                               bool select_min,              \
+                                               rmm::cuda_stream_view stream, \
+                                               rmm::mr::device_memory_resource* mr)
+
+instantiate_raft_matrix_detail_select_k(double, uint32_t);
+
+#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_float_int64_t.cu b/cpp/src/matrix/detail/select_k_float_int64_t.cu
new file mode 100644
index 0000000000..1f1d686048
--- /dev/null
+++ b/cpp/src/matrix/detail/select_k_float_int64_t.cu
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // int64_t
+#include <raft/matrix/detail/select_k-inl.cuh>
+
+// Expands to an explicit instantiation of raft::matrix::detail::select_k for (T, IdxT).
+#define instantiate_raft_matrix_detail_select_k(T, IdxT)                     \
+  template void raft::matrix::detail::select_k(const T* in_val,              \
+                                               const IdxT* in_idx,           \
+                                               size_t batch_size,            \
+                                               size_t len,                   \
+                                               int k,                        \
+                                               T* out_val,                   \
+                                               IdxT* out_idx,                \
+                                               bool select_min,              \
+                                               rmm::cuda_stream_view stream, \
+                                               rmm::mr::device_memory_resource* mr)
+
+instantiate_raft_matrix_detail_select_k(float, int64_t);
+
+#undef instantiate_raft_matrix_detail_select_k
diff --git a/cpp/src/matrix/detail/select_k_half_uint64_t.cu b/cpp/src/matrix/detail/select_k_half_int64_t.cu
similarity index 100%
rename from cpp/src/matrix/detail/select_k_half_uint64_t.cu
rename to cpp/src/matrix/detail/select_k_half_int64_t.cu
diff --git a/cpp/src/neighbors/detail/selection_faiss.cu b/cpp/src/neighbors/detail/selection_faiss.cu
index 067ac5fdda..034f37f8cc 100644
--- a/cpp/src/neighbors/detail/selection_faiss.cu
+++ b/cpp/src/neighbors/detail/selection_faiss.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <cstddef>  // size_t
+#include <cstdint>  // uint32_t
 #include <raft/neighbors/detail/selection_faiss-inl.cuh>
 
 #define instantiate_raft_neighbors_detail_select_k(payload_t, key_t)    \
@@ -30,6 +32,12 @@
 // @benfred: Not sure if this is correct. Should I not flip float and uint32_t?
 // It seems weird that float is the key and uint32_t is the payload type.
 instantiate_raft_neighbors_detail_select_k(uint32_t, float);
+instantiate_raft_neighbors_detail_select_k(int32_t, float);
 instantiate_raft_neighbors_detail_select_k(long, float);
+// Needed by the tests
+instantiate_raft_neighbors_detail_select_k(size_t, double);
+// Needed by test/neighbors/selection.cu:
+instantiate_raft_neighbors_detail_select_k(int, double);
+instantiate_raft_neighbors_detail_select_k(size_t, float);
 
 #undef instantiate_raft_neighbors_detail_select_k
diff --git a/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_2d.cu b/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
deleted file mode 100644
index b69751a62a..0000000000
--- a/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_2d.cu
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/neighbors/specializations.cuh>
-#include <raft/spatial/knn/detail/ball_cover/registers.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
-
-template void rbc_low_dim_pass_one<std::int64_t, float, std::uint32_t>(
-  raft::device_resources const& handle,
-  const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
-  const float* query,
-  const std::uint32_t n_query_rows,
-  std::uint32_t k,
-  const std::int64_t* R_knn_inds,
-  const float* R_knn_dists,
-  DistFunc<float, std::uint32_t>& dfunc,
-  std::int64_t* inds,
-  float* dists,
-  float weight,
-  std::uint32_t* dists_counter);
-
-};  // namespace detail
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
\ No newline at end of file
diff --git a/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_3d.cu b/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
deleted file mode 100644
index ca44ad3165..0000000000
--- a/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_one_3d.cu
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/neighbors/specializations.cuh>
-#include <raft/spatial/knn/detail/ball_cover/registers.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
-
-template void rbc_low_dim_pass_one<std::int64_t, float, std::uint32_t, 3>(
-  raft::device_resources const& handle,
-  const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
-  const float* query,
-  const std::uint32_t n_query_rows,
-  std::uint32_t k,
-  const std::int64_t* R_knn_inds,
-  const float* R_knn_dists,
-  DistFunc<float, std::uint32_t>& dfunc,
-  std::int64_t* inds,
-  float* dists,
-  float weight,
-  std::uint32_t* dists_counter);
-
-};  // namespace detail
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
\ No newline at end of file
diff --git a/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_2d.cu b/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
deleted file mode 100644
index ba44327653..0000000000
--- a/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_2d.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/neighbors/specializations.cuh>
-#include <raft/spatial/knn/detail/ball_cover/registers.cuh>
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
-
-template void rbc_low_dim_pass_two<std::int64_t, float, std::uint32_t, 2>(
-  raft::device_resources const& handle,
-  const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
-  const float* query,
-  const std::uint32_t n_query_rows,
-  std::uint32_t k,
-  const std::int64_t* R_knn_inds,
-  const float* R_knn_dists,
-  DistFunc<float, std::uint32_t>& dfunc,
-  std::int64_t* inds,
-  float* dists,
-  float weight,
-  std::uint32_t* post_dists_counter);
-};  // namespace detail
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
\ No newline at end of file
diff --git a/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_3d.cu b/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_3d.cu
deleted file mode 100644
index 59132c1f99..0000000000
--- a/cpp/src/neighbors/specializations/detail/ball_cover_lowdim_pass_two_3d.cu
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/neighbors/specializations.cuh>
-#include <raft/spatial/knn/detail/ball_cover/registers.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
-
-template void rbc_low_dim_pass_two<std::int64_t, float, std::uint32_t, 3>(
-  raft::device_resources const& handle,
-  const BallCoverIndex<std::int64_t, float, std::uint32_t>& index,
-  const float* query,
-  const std::uint32_t n_query_rows,
-  std::uint32_t k,
-  const std::int64_t* R_knn_inds,
-  const float* R_knn_dists,
-  DistFunc<float, std::uint32_t>& dfunc,
-  std::int64_t* inds,
-  float* dists,
-  float weight,
-  std::uint32_t* post_dists_counter);
-};  // namespace detail
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
\ No newline at end of file
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_00_generate.py b/cpp/src/spatial/knn/detail/ball_cover/registers_00_generate.py
new file mode 100644
index 0000000000..04e5a9e4b6
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_00_generate.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""Write one .cu file per (pass, dimension, distance) combination.
+
+Each file explicitly instantiates rbc_low_dim_pass_one or rbc_low_dim_pass_two.
+Files are written to the current directory; their source-tree paths are printed.
+"""
+
+# License banner and includes placed at the top of every generated file.
+header = """/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint> // int64_t
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+
+"""
+
+# Instantiation macro written into each pass-one file.
+macro_pass_one = """
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(                            \\
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \\
+  template void                                                                       \\
+  raft::spatial::knn::detail::rbc_low_dim_pass_one<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \\
+    raft::device_resources const& handle,                                                    \\
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \\
+    const Mvalue_t* query,                                                                   \\
+    const Mvalue_int n_query_rows,                                                           \\
+    Mvalue_int k,                                                                            \\
+    const Mvalue_idx* R_knn_inds,                                                            \\
+    const Mvalue_t* R_knn_dists,                                                             \\
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \\
+    Mvalue_idx* inds,                                                                        \\
+    Mvalue_t* dists,                                                                         \\
+    float weight,                                                                            \\
+    Mvalue_int* dists_counter)
+
+"""
+
+# Instantiation macro written into each pass-two file.
+macro_pass_two = """
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(                            \\
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \\
+  template void                                                                       \\
+  raft::spatial::knn::detail::rbc_low_dim_pass_two<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \\
+    raft::device_resources const& handle,                                                    \\
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \\
+    const Mvalue_t* query,                                                                   \\
+    const Mvalue_int n_query_rows,                                                           \\
+    Mvalue_int k,                                                                            \\
+    const Mvalue_idx* R_knn_inds,                                                            \\
+    const Mvalue_t* R_knn_dists,                                                             \\
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \\
+    Mvalue_idx* inds,                                                                        \\
+    Mvalue_t* dists,                                                                         \\
+    float weight,                                                                            \\
+    Mvalue_int* dists_counter)
+
+"""
+
+# File-name suffix -> C++ distance template named in the instantiation.
+distances = dict(
+    haversine="raft::spatial::knn::detail::HaversineFunc",
+    euclidean="raft::spatial::knn::detail::EuclideanFunc",
+    dist="raft::spatial::knn::detail::DistFunc",
+)
+
+# Pass one is emitted for every combination before pass two.
+passes = [("one", macro_pass_one), ("two", macro_pass_two)]
+for pass_name, macro in passes:
+    instantiate = f"instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_{pass_name}"
+    for suffix, dist_func in distances.items():
+        for dim in (2, 3):
+            path = f"registers_pass_{pass_name}_{dim}d_{suffix}.cu"
+            with open(path, "w") as out:
+                out.write(header + macro)
+                out.write(f"{instantiate}(\n")
+                out.write(f"  std::int64_t, float, std::uint32_t, {dim}, {dist_func});\n")
+                out.write(f"#undef {instantiate}\n")
+            print(f"src/spatial/knn/detail/ball_cover/{path}")
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu
new file mode 100644
index 0000000000..42a14d11e0
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // int64_t
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_one<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(
+  std::int64_t, float, std::uint32_t, 2, raft::spatial::knn::detail::DistFunc);
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu
new file mode 100644
index 0000000000..437b5a3d7e
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // int64_t
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_one<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+// Explicit instantiation definition: <int64_t, float, uint32_t, 2>, Mdist_func = EuclideanFunc.
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(
+  std::int64_t, float, std::uint32_t, 2, raft::spatial::knn::detail::EuclideanFunc);
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one  // single-use helper macro
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu
new file mode 100644
index 0000000000..ef69305571
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // int64_t
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_one<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+// Explicit instantiation definition: <int64_t, float, uint32_t, 2>, Mdist_func = HaversineFunc.
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(
+  std::int64_t, float, std::uint32_t, 2, raft::spatial::knn::detail::HaversineFunc);
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one  // single-use helper macro
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu
new file mode 100644
index 0000000000..111513e6d0
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // int64_t
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_one<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+// Explicit instantiation definition: <int64_t, float, uint32_t, 3> with Mdist_func = DistFunc.
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(
+  std::int64_t, float, std::uint32_t, 3, raft::spatial::knn::detail::DistFunc);
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one  // single-use helper macro
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu
new file mode 100644
index 0000000000..98dbcac2aa
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // int64_t
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_one<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+// Explicit instantiation definition: <int64_t, float, uint32_t, 3>, Mdist_func = EuclideanFunc.
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(
+  std::int64_t, float, std::uint32_t, 3, raft::spatial::knn::detail::EuclideanFunc);
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one  // single-use helper macro
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu
new file mode 100644
index 0000000000..7b0c885986
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // int64_t
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_one<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+// Explicit instantiation definition: <int64_t, float, uint32_t, 3>, Mdist_func = HaversineFunc.
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(
+  std::int64_t, float, std::uint32_t, 3, raft::spatial::knn::detail::HaversineFunc);
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one  // single-use helper macro
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu
new file mode 100644
index 0000000000..17dafbe862
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // int64_t
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_two<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+// Explicit instantiation definition: <int64_t, float, uint32_t, 2> with Mdist_func = DistFunc.
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(
+  std::int64_t, float, std::uint32_t, 2, raft::spatial::knn::detail::DistFunc);
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two  // single-use helper macro
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu
new file mode 100644
index 0000000000..7a3d770b87
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // int64_t
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_two<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+// Explicit instantiation definition: <int64_t, float, uint32_t, 2>, Mdist_func = EuclideanFunc.
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(
+  std::int64_t, float, std::uint32_t, 2, raft::spatial::knn::detail::EuclideanFunc);
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two  // single-use helper macro
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu
new file mode 100644
index 0000000000..0c877ad717
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // int64_t
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_two<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+// Explicit instantiation definition: <int64_t, float, uint32_t, 2>, Mdist_func = HaversineFunc.
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(
+  std::int64_t, float, std::uint32_t, 2, raft::spatial::knn::detail::HaversineFunc);
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two  // single-use helper macro
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu
new file mode 100644
index 0000000000..2bfa4bdf5e
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // int64_t
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_two<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+// Explicit instantiation definition: <int64_t, float, uint32_t, 3> with Mdist_func = DistFunc.
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(
+  std::int64_t, float, std::uint32_t, 3, raft::spatial::knn::detail::DistFunc);
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two  // single-use helper macro
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu
new file mode 100644
index 0000000000..30f3ad97e9
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // int64_t
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_two<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+// Explicit instantiation definition: <int64_t, float, uint32_t, 3>, Mdist_func = EuclideanFunc.
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(
+  std::int64_t, float, std::uint32_t, 3, raft::spatial::knn::detail::EuclideanFunc);
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two  // single-use helper macro
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu
new file mode 100644
index 0000000000..8b25ca9698
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>  // int64_t
+#include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(                            \
+  Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
+  template void                                                                              \
+  raft::spatial::knn::detail::rbc_low_dim_pass_two<Mvalue_idx, Mvalue_t, Mvalue_int, Mdims>( \
+    raft::device_resources const& handle,                                                    \
+    const BallCoverIndex<Mvalue_idx, Mvalue_t, Mvalue_int>& index,                           \
+    const Mvalue_t* query,                                                                   \
+    const Mvalue_int n_query_rows,                                                           \
+    Mvalue_int k,                                                                            \
+    const Mvalue_idx* R_knn_inds,                                                            \
+    const Mvalue_t* R_knn_dists,                                                             \
+    Mdist_func<Mvalue_t, Mvalue_int>& dfunc,                                                 \
+    Mvalue_idx* inds,                                                                        \
+    Mvalue_t* dists,                                                                         \
+    float weight,                                                                            \
+    Mvalue_int* dists_counter)
+// Explicit instantiation definition: <int64_t, float, uint32_t, 3>, Mdist_func = HaversineFunc.
+instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two(
+  std::int64_t, float, std::uint32_t, 3, raft::spatial::knn::detail::HaversineFunc);
+#undef instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_two  // single-use helper macro
diff --git a/cpp/src/spatial/knn/detail/fused_l2_knn.cu b/cpp/src/spatial/knn/detail/fused_l2_knn.cu
new file mode 100644
index 0000000000..8795e265e9
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/fused_l2_knn.cu
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstddef>                           // size_t
+#include <cstdint>                           // int_Xt
+#include <raft/distance/distance_types.hpp>  // DistanceType
+#include <raft/spatial/knn/detail/fused_l2_knn-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_fusedL2Knn(Mvalue_idx, Mvalue_t, MusePrevTopKs)  \
+  template void raft::spatial::knn::detail::fusedL2Knn<Mvalue_idx, Mvalue_t, MusePrevTopKs>( \
+    size_t D,                                                                                \
+    Mvalue_idx * out_inds,                                                                   \
+    Mvalue_t * out_dists,                                                                    \
+    const Mvalue_t* index,                                                                   \
+    const Mvalue_t* query,                                                                   \
+    size_t n_index_rows,                                                                     \
+    size_t n_query_rows,                                                                     \
+    int k,                                                                                   \
+    bool rowMajorIndex,                                                                      \
+    bool rowMajorQuery,                                                                      \
+    cudaStream_t stream,                                                                     \
+    raft::distance::DistanceType metric)
+// Explicit instantiation definitions: each Mvalue_idx below is emitted for both MusePrevTopKs.
+instantiate_raft_spatial_knn_detail_fusedL2Knn(long, float, true);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(long, float, false);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(int, float, true);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(int, float, false);
+
+// These are used by brute_force_knn:
+instantiate_raft_spatial_knn_detail_fusedL2Knn(unsigned int, float, true);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(unsigned int, float, false);
+
+#undef instantiate_raft_spatial_knn_detail_fusedL2Knn  // helper macro is not needed past here
diff --git a/cpp/src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu b/cpp/src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
new file mode 100644
index 0000000000..67b08655e6
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstddef>                           // size_t
+#include <cstdint>                           // int_Xt
+#include <raft/distance/distance_types.hpp>  // DistanceType
+#include <raft/spatial/knn/detail/fused_l2_knn-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_fusedL2Knn(Mvalue_idx, Mvalue_t, MusePrevTopKs)  \
+  template void raft::spatial::knn::detail::fusedL2Knn<Mvalue_idx, Mvalue_t, MusePrevTopKs>( \
+    size_t D,                                                                                \
+    Mvalue_idx * out_inds,                                                                   \
+    Mvalue_t * out_dists,                                                                    \
+    const Mvalue_t* index,                                                                   \
+    const Mvalue_t* query,                                                                   \
+    size_t n_index_rows,                                                                     \
+    size_t n_query_rows,                                                                     \
+    int k,                                                                                   \
+    bool rowMajorIndex,                                                                      \
+    bool rowMajorQuery,                                                                      \
+    cudaStream_t stream,                                                                     \
+    raft::distance::DistanceType metric)
+
+instantiate_raft_spatial_knn_detail_fusedL2Knn(int32_t, float, true);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(int32_t, float, false);
+
+#undef instantiate_raft_spatial_knn_detail_fusedL2Knn
diff --git a/cpp/src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu b/cpp/src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
new file mode 100644
index 0000000000..3c0d13710e
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstddef>                           // size_t
+#include <cstdint>                           // int_Xt
+#include <raft/distance/distance_types.hpp>  // DistanceType
+#include <raft/spatial/knn/detail/fused_l2_knn-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_fusedL2Knn(Mvalue_idx, Mvalue_t, MusePrevTopKs)  \
+  template void raft::spatial::knn::detail::fusedL2Knn<Mvalue_idx, Mvalue_t, MusePrevTopKs>( \
+    size_t D,                                                                                \
+    Mvalue_idx * out_inds,                                                                   \
+    Mvalue_t * out_dists,                                                                    \
+    const Mvalue_t* index,                                                                   \
+    const Mvalue_t* query,                                                                   \
+    size_t n_index_rows,                                                                     \
+    size_t n_query_rows,                                                                     \
+    int k,                                                                                   \
+    bool rowMajorIndex,                                                                      \
+    bool rowMajorQuery,                                                                      \
+    cudaStream_t stream,                                                                     \
+    raft::distance::DistanceType metric)
+
+instantiate_raft_spatial_knn_detail_fusedL2Knn(int64_t, float, true);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(int64_t, float, false);
+
+#undef instantiate_raft_spatial_knn_detail_fusedL2Knn
diff --git a/cpp/src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu b/cpp/src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
new file mode 100644
index 0000000000..e799c5181f
--- /dev/null
+++ b/cpp/src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstddef>                           // size_t
+#include <cstdint>                           // int_Xt
+#include <raft/distance/distance_types.hpp>  // DistanceType
+#include <raft/spatial/knn/detail/fused_l2_knn-inl.cuh>
+
+#define instantiate_raft_spatial_knn_detail_fusedL2Knn(Mvalue_idx, Mvalue_t, MusePrevTopKs)  \
+  template void raft::spatial::knn::detail::fusedL2Knn<Mvalue_idx, Mvalue_t, MusePrevTopKs>( \
+    size_t D,                                                                                \
+    Mvalue_idx * out_inds,                                                                   \
+    Mvalue_t * out_dists,                                                                    \
+    const Mvalue_t* index,                                                                   \
+    const Mvalue_t* query,                                                                   \
+    size_t n_index_rows,                                                                     \
+    size_t n_query_rows,                                                                     \
+    int k,                                                                                   \
+    bool rowMajorIndex,                                                                      \
+    bool rowMajorQuery,                                                                      \
+    cudaStream_t stream,                                                                     \
+    raft::distance::DistanceType metric)
+
+// These are used by brute_force_knn:
+instantiate_raft_spatial_knn_detail_fusedL2Knn(uint32_t, float, true);
+instantiate_raft_spatial_knn_detail_fusedL2Knn(uint32_t, float, false);
+
+#undef instantiate_raft_spatial_knn_detail_fusedL2Knn
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index e805f53712..c7292361b7 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -121,6 +121,7 @@ if(BUILD_TESTS)
     DISTANCE_TEST
     PATH
     test/distance/dist_adj.cu
+    test/distance/dist_adj_distance_instance.cu
     test/distance/dist_canberra.cu
     test/distance/dist_correlation.cu
     test/distance/dist_cos.cu
@@ -316,4 +317,24 @@ if(BUILD_TESTS)
     NAME UTILS_TEST PATH test/core/seive.cu test/util/bitonic_sort.cu test/util/cudart_utils.cpp
     test/util/device_atomics.cu test/util/integer_utils.cpp test/util/pow2_utils.cu
   )
+
+  add_custom_target(ALL_TESTS)
+  add_dependencies(
+    ALL_TESTS
+    CLUSTER_TEST
+    CORE_TEST
+    DISTANCE_TEST
+    LABEL_TEST
+    LINALG_TEST
+    MATRIX_TEST
+    NEIGHBORS_TEST
+    RANDOM_TEST
+    SOLVERS_TEST
+    SPARSE_DIST_TEST
+    SPARSE_NEIGHBORS_TEST
+    SPARSE_TEST
+    STATS_TEST
+    UTILS_TEST
+  )
+
 endif()
diff --git a/cpp/test/core/handle.cpp b/cpp/test/core/handle.cpp
index 9f416d3ae8..fddfd58bb8 100644
--- a/cpp/test/core/handle.cpp
+++ b/cpp/test/core/handle.cpp
@@ -22,6 +22,7 @@
 #include <raft/core/comms.hpp>
 #include <raft/core/handle.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
 #include <unordered_map>
 
 namespace raft {
diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu
index ce802e5138..bb63cc9be3 100644
--- a/cpp/test/distance/dist_adj.cu
+++ b/cpp/test/distance/dist_adj.cu
@@ -22,6 +22,8 @@
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include "dist_adj.cuh"
+
 namespace raft {
 namespace distance {
 
@@ -74,18 +76,6 @@ struct DistanceAdjInputs {
   unsigned long long int seed;
 };
 
-template <typename AccT, typename DataT, typename OutT, typename Index>
-struct threshold_final_op {
-  DataT threshold_val;
-
-  __device__ __host__ threshold_final_op() noexcept : threshold_val(0.0) {}
-  __device__ __host__ threshold_final_op(DataT val) noexcept : threshold_val(val) {}
-  __device__ __host__ OutT operator()(AccT d_val, Index g_idx) const noexcept
-  {
-    return d_val <= threshold_val;
-  }
-};
-
 template <typename DataType>
 ::std::ostream& operator<<(::std::ostream& os, const DistanceAdjInputs<DataType>& dims)
 {
@@ -140,7 +130,7 @@ class DistanceAdjTest : public ::testing::TestWithParam<DistanceAdjInputs<DataTy
                                                   n,
                                                   k,
                                                   workspace.data(),
-                                                  workspace.size(),
+                                                  worksize,
                                                   threshold_op,
                                                   isRowMajor);
     handle.sync_stream(stream);
diff --git a/cpp/test/distance/dist_adj.cuh b/cpp/test/distance/dist_adj.cuh
new file mode 100644
index 0000000000..ee4554ff29
--- /dev/null
+++ b/cpp/test/distance/dist_adj.cuh
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dist_adj_threshold.cuh"
+#include <raft/distance/distance.cuh>
+
+#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, FinalLambda, IdxT)       \
+  extern template void raft::distance::distance<DT, DataT, AccT, OutT, FinalLambda, IdxT>( \
+    raft::resources const& handle,                                                         \
+    const DataT* x,                                                                        \
+    const DataT* y,                                                                        \
+    OutT* dist,                                                                            \
+    IdxT m,                                                                                \
+    IdxT n,                                                                                \
+    IdxT k,                                                                                \
+    void* workspace,                                                                       \
+    size_t worksize,                                                                       \
+    FinalLambda fin_op,                                                                    \
+    bool isRowMajor,                                                                       \
+    DataT metric_arg)
+
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Expanded,
+                                   float,
+                                   float,
+                                   uint8_t,
+                                   raft::distance::threshold_float,
+                                   int);
+
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Expanded,
+                                   double,
+                                   double,
+                                   uint8_t,
+                                   raft::distance::threshold_double,
+                                   int);
+
+#undef instantiate_raft_distance_distance
+
+#define instantiate_raft_distance_getWorkspaceSize(DistT, DataT, AccT, OutT, IdxT)         \
+  extern template size_t raft::distance::getWorkspaceSize<DistT, DataT, AccT, OutT, IdxT>( \
+    const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k)
+
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, float, float, uint8_t, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, double, double, uint8_t, int);
+
+#undef instantiate_raft_distance_getWorkspaceSize
+
+#define instantiate_raft_distance_getWorkspaceSize(DistT, DataT, AccT, OutT, IdxT)         \
+  extern template size_t raft::distance::getWorkspaceSize<DistT, DataT, AccT, OutT, IdxT>( \
+    const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k)
+
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, float, float, uint8_t, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, double, double, uint8_t, int);
+
+#undef instantiate_raft_distance_getWorkspaceSize
diff --git a/cpp/test/distance/dist_adj_distance_instance.cu b/cpp/test/distance/dist_adj_distance_instance.cu
new file mode 100644
index 0000000000..2250701fe9
--- /dev/null
+++ b/cpp/test/distance/dist_adj_distance_instance.cu
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#undef RAFT_EXPLICIT_INSTANTIATE
+
+#include "dist_adj_threshold.cuh"
+#include <cstdint>
+#include <raft/distance/distance-inl.cuh>
+
+#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, FinalLambda, IdxT) \
+  template void raft::distance::distance<DT, DataT, AccT, OutT, FinalLambda, IdxT>(  \
+    raft::resources const& handle,                                                   \
+    const DataT* x,                                                                  \
+    const DataT* y,                                                                  \
+    OutT* dist,                                                                      \
+    IdxT m,                                                                          \
+    IdxT n,                                                                          \
+    IdxT k,                                                                          \
+    void* workspace,                                                                 \
+    size_t worksize,                                                                 \
+    FinalLambda fin_op,                                                              \
+    bool isRowMajor,                                                                 \
+    DataT metric_arg)
+
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Expanded,
+                                   float,
+                                   float,
+                                   uint8_t,
+                                   raft::distance::threshold_float,
+                                   int);
+
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Expanded,
+                                   double,
+                                   double,
+                                   uint8_t,
+                                   raft::distance::threshold_double,
+                                   int);
+
+#undef instantiate_raft_distance_distance
+
+#define instantiate_raft_distance_getWorkspaceSize(DistT, DataT, AccT, OutT, IdxT)  \
+  template size_t raft::distance::getWorkspaceSize<DistT, DataT, AccT, OutT, IdxT>( \
+    const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k)
+
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, float, float, uint8_t, int);
+instantiate_raft_distance_getWorkspaceSize(
+  raft::distance::DistanceType::L2Expanded, double, double, uint8_t, int);
+
+#undef instantiate_raft_distance_getWorkspaceSize
diff --git a/cpp/test/distance/dist_adj_threshold.cuh b/cpp/test/distance/dist_adj_threshold.cuh
new file mode 100644
index 0000000000..ad02be64aa
--- /dev/null
+++ b/cpp/test/distance/dist_adj_threshold.cuh
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace raft::distance {
+
+template <typename AccT, typename DataT, typename OutT, typename Index>
+struct threshold_final_op {
+  DataT threshold_val;
+
+  __device__ __host__ threshold_final_op() noexcept : threshold_val(0.0) {}
+  __device__ __host__ threshold_final_op(DataT val) noexcept : threshold_val(val) {}
+  __device__ __host__ OutT operator()(AccT d_val, Index g_idx) const noexcept
+  {
+    return d_val <= threshold_val;
+  }
+};
+
+using threshold_float  = threshold_final_op<float, float, uint8_t, int>;
+using threshold_double = threshold_final_op<double, double, uint8_t, int>;
+
+}  // namespace raft::distance
diff --git a/cpp/test/linalg/eigen_solvers.cu b/cpp/test/linalg/eigen_solvers.cu
index 1f29d7e275..8b9af959fd 100644
--- a/cpp/test/linalg/eigen_solvers.cu
+++ b/cpp/test/linalg/eigen_solvers.cu
@@ -14,8 +14,10 @@
  * limitations under the License.
  */
 
-#include <raft/common/nvtx.hpp>
+#include <type_traits>
+
 #include <raft/core/device_resources.hpp>
+#include <raft/core/nvtx.hpp>
 #include <raft/spectral/eigen_solvers.cuh>
 #include <raft/spectral/partition.cuh>
 
diff --git a/cpp/test/matrix/select_k.cu b/cpp/test/matrix/select_k.cu
index 2a40d70abc..e92f6c05cc 100644
--- a/cpp/test/matrix/select_k.cu
+++ b/cpp/test/matrix/select_k.cu
@@ -232,9 +232,9 @@ struct SelectK  // NOLINT
     auto& in_dists   = ref.get_in_dists();
     auto compare_ids = [&in_ids, &in_dists](const IdxT& i, const IdxT& j) {
       if (i == j) return true;
-      auto ix_i = uint64_t(std::find(in_ids.begin(), in_ids.end(), i) - in_ids.begin());
-      auto ix_j = uint64_t(std::find(in_ids.begin(), in_ids.end(), j) - in_ids.begin());
-      if (ix_i >= in_ids.size() || ix_j >= in_ids.size()) return false;
+      auto ix_i = int64_t(std::find(in_ids.begin(), in_ids.end(), i) - in_ids.begin());
+      auto ix_j = int64_t(std::find(in_ids.begin(), in_ids.end(), j) - in_ids.begin());
+      if (size_t(ix_i) >= in_ids.size() || size_t(ix_j) >= in_ids.size()) return false;
       auto dist_i = in_dists[ix_i];
       auto dist_j = in_dists[ix_j];
       if (dist_i == dist_j) return true;
@@ -434,7 +434,7 @@ INSTANTIATE_TEST_CASE_P(                          // NOLINT
                                    select::Algo::kWarpDistributedShm)));
 
 using ReferencedRandomDoubleSizeT =
-  SelectK<double, uint64_t, with_ref<select::Algo::kPublicApi>::params_random>;
+  SelectK<double, int64_t, with_ref<select::Algo::kPublicApi>::params_random>;
 TEST_P(ReferencedRandomDoubleSizeT, Run) { run(); }  // NOLINT
 INSTANTIATE_TEST_CASE_P(                             // NOLINT
   SelectK,
@@ -461,7 +461,7 @@ INSTANTIATE_TEST_CASE_P(                                 // NOLINT
                                    select::Algo::kRadix11bitsExtraPass)));
 
 using ReferencedRandomFloatSizeT =
-  SelectK<float, uint64_t, with_ref<select::Algo::kRadix8bits>::params_random>;
+  SelectK<float, int64_t, with_ref<select::Algo::kRadix8bits>::params_random>;
 TEST_P(ReferencedRandomFloatSizeT, LargeK) { run(); }  // NOLINT
 INSTANTIATE_TEST_CASE_P(SelectK,                       // NOLINT
                         ReferencedRandomFloatSizeT,
diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh
index fc448f014f..438c56da21 100644
--- a/cpp/test/neighbors/ann_utils.cuh
+++ b/cpp/test/neighbors/ann_utils.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <raft/core/device_mdarray.hpp>  // raft::make_device_matrix
 #include <raft/distance/distance_types.hpp>
 #include <raft/matrix/detail/select_k.cuh>
 #include <raft/matrix/matrix.cuh>
diff --git a/cpp/test/neighbors/fused_l2_knn.cu b/cpp/test/neighbors/fused_l2_knn.cu
index ab05b41cc9..d7e0e1e067 100644
--- a/cpp/test/neighbors/fused_l2_knn.cu
+++ b/cpp/test/neighbors/fused_l2_knn.cu
@@ -81,9 +81,9 @@ class FusedL2KNNTest : public ::testing::TestWithParam<FusedL2KNNInputs> {
     rmm::device_uvector<T> temp_distances(num_db_vecs * num_queries, stream_);
     distance::pairwise_distance(
       handle_,
-      raft::make_device_matrix_view<T, int64_t>(search_queries.data(), num_queries, dim),
-      raft::make_device_matrix_view<T, int64_t>(database.data(), num_db_vecs, dim),
-      raft::make_device_matrix_view<T, int64_t>(temp_distances.data(), num_queries, num_db_vecs),
+      raft::make_device_matrix_view<T, int32_t>(search_queries.data(), num_queries, dim),
+      raft::make_device_matrix_view<T, int32_t>(database.data(), num_db_vecs, dim),
+      raft::make_device_matrix_view<T, int32_t>(temp_distances.data(), num_queries, num_db_vecs),
       metric);
 
     spatial::knn::select_k<int64_t, T>(temp_distances.data(),
diff --git a/cpp/test/neighbors/selection.cu b/cpp/test/neighbors/selection.cu
index 9f13de357c..281400c396 100644
--- a/cpp/test/neighbors/selection.cu
+++ b/cpp/test/neighbors/selection.cu
@@ -14,6 +14,12 @@
  * limitations under the License.
  */
 
+// XXX: we disable the RAFT_EXPLICIT_INSTANTIATE restriction for now because we
+// need kFaissMax, which is not exposed by selection_faiss-ext.cuh.
+// TODO-inl-headers: consider how to re-enable it.
+#undef RAFT_EXPLICIT_INSTANTIATE
+#include <raft/neighbors/detail/selection_faiss.cuh>
+
 #include <algorithm>
 #include <gtest/gtest.h>
 #include <numeric>
@@ -24,9 +30,6 @@
 
 #include <raft/sparse/detail/utils.h>
 #include <raft/spatial/knn/knn.cuh>
-#if defined RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
 
 namespace raft::spatial::selection {
 

From 96894b28a60a8651cd8f16339d7d051648738de1 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Wed, 12 Apr 2023 11:29:22 +0200
Subject: [PATCH 17/89] Ensure uniformity of dispatch headers

---
 .../detail/pairwise_matrix/dispatch-inl.cuh      |  2 +-
 .../distance/detail/pairwise_matrix/dispatch.cuh | 16 ----------------
 .../linalg/detail/coalesced_reduction-ext.cuh    |  6 +++---
 .../raft/linalg/detail/coalesced_reduction.cuh   | 11 ++++++-----
 .../detail/ivf_flat_interleaved_scan.cuh         |  8 ++++----
 5 files changed, 14 insertions(+), 29 deletions(-)

diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh
index 8df671d637..55a064acf9 100644
--- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh
@@ -40,7 +40,7 @@
 // Including dispatch_sm80.cuh can slow down compile times (due to CUTLASS).
 // Therefore, it is the including file's responsibility to include the correct
 // dispatch_smXX.cuh headers, as is done in raft/distance/detail/distance.cuh
-// and the specializations in src/distance/distance/specializations/detail/.
+// and src/distance/detail/pairwise_matrix/dispatch_*.cu.
 
 namespace raft::distance::detail {
 
diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
index 73666f639f..f26c67a8d9 100644
--- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
@@ -15,22 +15,6 @@
  */
 #pragma once
 
-/* This file has two responsibilities:
- *
- * 1. Dispatch to the correct implementation of a kernel based on the
- *    architecture of the device on which the kernel will be launched. For
- *    instance, the cosine distance has a CUTLASS-based implementation that can
- *    be used on SM80+ and the normal implementation that is used on older
- *    architectures.
- *
- * 2. Provide concise function templates that can be instantiated in
- *    src/distance/distance/specializations/detail/. Previously,
- *    raft::distance::detail::distance was instantiated. The function
- *    necessarily required a large set of include files, which slowed down the
- *    build. The raft::distance::detail::pairwise_matrix_arch_dispatch functions
- *    do not require as large an include files set, which speeds up the build.
- */
-
 #if !defined(RAFT_EXPLICIT_INSTANTIATE)
 #include "dispatch-inl.cuh"
 #endif
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh
index 2a1bafae43..7dbdb59c10 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh
@@ -19,9 +19,9 @@
 #include "coalesced_reduction-types.cuh"
 #include <raft/core/operators.hpp>
 
-// Include inline definition as well. We cannot possibly cover all
-// instantiations in this file.
-#include "coalesced_reduction-inl.cuh"
+// The explicit instantiation of raft::linalg::detail::coalescedReduction is not
+// forced because there would be too many instances. Instead, we cover the most
+// common instantiations with extern template instantiations below.
 
 #define instantiate_raft_linalg_detail_coalescedReduction(                              \
   InType, OutType, IdxType, MainLambda, ReduceLambda, FinalLambda)                      \
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
index 9a51611de1..3e6b17978b 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction.cuh
@@ -16,10 +16,11 @@
 
 #pragma once
 
-#if defined(RAFT_COMPILED) && defined(RAFT_EXPLICIT_INSTANTIATE)
-// Too many lambdas and complicated types to instantiate everything..
-#include "coalesced_reduction-ext.cuh"
-#include "coalesced_reduction-inl.cuh"
-#else
+// Always include inline definitions of coalesced reduction, because we do not
+// force explicit instantiation.
 #include "coalesced_reduction-inl.cuh"
+
+// Do include the extern template instantiations when possible.
+#ifdef RAFT_COMPILED
+#include "coalesced_reduction-ext.cuh"
 #endif
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh
index 74a1a84e74..d6a4fed973 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh
@@ -16,10 +16,10 @@
 
 #pragma once
 
-#ifdef RAFT_COMPILED
-#include "ivf_flat_interleaved_scan-ext.cuh"
-#endif
-
 #if !defined(RAFT_EXPLICIT_INSTANTIATE)
 #include "ivf_flat_interleaved_scan-inl.cuh"
 #endif
+
+#ifdef RAFT_COMPILED
+#include "ivf_flat_interleaved_scan-ext.cuh"
+#endif

From 2d9dc215c37421453ae590997dc747afc6262e7b Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Wed, 12 Apr 2023 11:33:35 +0200
Subject: [PATCH 18/89] WIP: update docs

---
 docs/source/using_libraft.md | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/docs/source/using_libraft.md b/docs/source/using_libraft.md
index f4f966f2c8..6fa2e644a9 100644
--- a/docs/source/using_libraft.md
+++ b/docs/source/using_libraft.md
@@ -1,13 +1,33 @@
 # Using The Pre-Compiled Binary
 
-At its core, RAFT is a header-only template library, which makes it very powerful in that APIs can be called with various different combinations of data types and only the templates which are actually used will be compiled into your binaries. This increased flexibility comes with a drawback that all the APIs need to be declared inline and thus calls which are made frequently in your code could be compiled again each source file for which they are invoked.
+At its core, RAFT is a header-only template library, which makes it very powerful in that APIs can be called with various different combinations of data types and only the templates which are actually used will be compiled into your binaries. This increased flexibility comes with a drawback that all the APIs need to be declared inline and thus calls which are made frequently in your code could be compiled again in each source file for which they are invoked.
 
-For most functions, this overhead is pretty minimal and not noticeable but some of RAFT's APIs consist of very complex hierarchies of function calls that ultimately end up dispatching to device code that's executed on the GPU. The compile times for these APIs may still be bearable when compiling for only a single compute architecture but could end up becoming extremely slow to compile for all of the supported architectures at once.
+For most functions, compile-time overhead is minimal but some of RAFT's APIs take a substantial time to compile. As a rule of thumb, most functionality in `raft::distance`, `raft::neighbors`, and `raft::spatial` is expensive to compile and most functionality in other namespaces has little compile-time overhead.
 
-There are three ways to solve this problem and speed up compile times:
-1. Continue to use RAFT as a header-only library and create a CUDA source file in your project to explicitly instantiate the templates which are slow to compile. This can be tedious and will still require compiling the slow code at least once, but it's the most flexible option if you are using types that aren't already compiled into `libraft`
-2. If you are able to use one of the template types that are already being compiled into `libraft`, you can use the pre-compiled template specializations, which I will describe in more detail in the following section.
-3. If you would like to use RAFT but either cannot or would prefer not to compile any CUDA code yourself, you can simply add `libraft` to your link libraries and use the growing set of runtime APIs.
+
+To speed up compilation when using RAFT as a header-only library, you can do the following... 
+
+To speed up compilation when using the precompiled RAFT library, you can do the
+following:
+
+1. 
+
+
+There are three ways to speed up compile times:
+
+1. Continue to use RAFT as a header-only library and create a CUDA source file
+   in your project to explicitly instantiate the templates which are slow to
+   compile. This can be tedious and will still require compiling the slow code
+   at least once, but it's the most flexible option if you are using types that
+   aren't already compiled into `libraft`.
+
+2. If you are able to use one of the template types that are already being
+   compiled into `libraft`, you can use the pre-compiled template
+   instantiations, which are described in more detail in the following section.
+
+3. If you would like to use RAFT but either cannot or would prefer not to
+   compile any CUDA code yourself, you can simply add `libraft` to your link
+   libraries and use the growing set of runtime APIs.
 
 ## Using Template Specializations
 
@@ -56,4 +76,4 @@ We can see here that the function `raft::neighbors::ivf_pq::detail::get_compute_
 
 RAFT contains a growing list of runtime APIs that, unlike the pre-compiled template specializations, allow you to link against `libraft` and invoke RAFT directly from `cpp` files. The benefit to RAFT's runtime APIs are two-fold- unlike the template specializations, which still require your code be compiled with the CUDA compiler (`nvcc`), the `runtime` APIs are the lightweight wrappers which enable `pylibraft`.
 
-Similar to the pre-compiled template specializations, RAFT's runtime APIs 
\ No newline at end of file
+Similar to the pre-compiled template specializations, RAFT's runtime APIs 

From 31def15fdbaa5b55377fa184cb06c6a81407319c Mon Sep 17 00:00:00 2001
From: Raymond Douglass <ray@raydouglass.com>
Date: Thu, 23 Mar 2023 14:57:18 -0400
Subject: [PATCH 19/89] DOC

---
 .github/workflows/build.yaml                  | 16 +++++++-------
 .github/workflows/pr.yaml                     | 22 +++++++++----------
 .github/workflows/test.yaml                   |  8 +++----
 .../all_cuda-118_arch-x86_64.yaml             | 11 +++++-----
 cpp/CMakeLists.txt                            |  4 ++--
 cpp/doxygen/Doxyfile                          |  2 +-
 dependencies.yaml                             | 12 +++++-----
 docs/source/build.md                          |  4 ++--
 docs/source/conf.py                           |  4 ++--
 docs/source/developer_guide.md                | 18 +++++++--------
 fetch_rapids.cmake                            |  2 +-
 python/pylibraft/CMakeLists.txt               |  2 +-
 python/pylibraft/pylibraft/__init__.py        |  2 +-
 python/pylibraft/pyproject.toml               | 17 ++++++--------
 python/raft-dask/CMakeLists.txt               |  2 +-
 python/raft-dask/pyproject.toml               | 18 +++++++--------
 python/raft-dask/raft_dask/__init__.py        |  2 +-
 17 files changed, 72 insertions(+), 74 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 32aab5656b..3c8cc4912d 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch' && github.event_name == 'push'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06
     with:
       build_type: branch
       node_type: "gpu-latest-1"
@@ -66,7 +66,7 @@ jobs:
       run_script: "ci/build_docs.sh"
   wheel-build-pylibraft:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -79,7 +79,7 @@ jobs:
   wheel-publish-pylibraft:
     needs: wheel-build-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -89,7 +89,7 @@ jobs:
   wheel-build-raft-dask:
     needs: wheel-publish-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -102,7 +102,7 @@ jobs:
   wheel-publish-raft-dask:
     needs: wheel-build-raft-dask
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-publish.yml@branch-23.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 97554b380e..6159b5c8ab 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -23,41 +23,41 @@ jobs:
       - wheel-build-raft-dask
       - wheel-tests-raft-dask
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.06
   checks:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.06
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.06
     with:
       build_type: pull-request
       node_type: cpu16
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.06
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.06
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06
     with:
       build_type: pull-request
       node_type: "gpu-latest-1"
@@ -67,7 +67,7 @@ jobs:
   wheel-build-pylibraft:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06
     with:
       build_type: pull-request
       package-name: pylibraft
@@ -77,7 +77,7 @@ jobs:
   wheel-tests-pylibraft:
     needs: wheel-build-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06
     with:
       build_type: pull-request
       package-name: pylibraft
@@ -89,7 +89,7 @@ jobs:
   wheel-build-raft-dask:
     needs: wheel-tests-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-build.yml@branch-23.06
     with:
       build_type: pull-request
       package-name: raft_dask
@@ -100,7 +100,7 @@ jobs:
   wheel-tests-raft-dask:
     needs: wheel-build-raft-dask
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06
     with:
       build_type: pull-request
       package-name: raft_dask
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index d204d2c16e..11ff3333d1 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -24,7 +24,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -32,7 +32,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-tests-pylibraft:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -44,7 +44,7 @@ jobs:
       test-unittest: "python -m pytest -v ./python/pylibraft/pylibraft/test"
   wheel-tests-raft-dask:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.04
+    uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@branch-23.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index fd2d1d2280..a992ebddb1 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -18,10 +18,9 @@ dependencies:
 - cupy
 - cxx-compiler
 - cython>=0.29,<0.30
-- dask-core==2023.3.2
-- dask-cuda==23.4.*
-- dask==2023.3.2
-- distributed==2023.3.2.1
+- dask-cuda=23.06
+- dask>=2023.1.1
+- distributed>=2023.1.1
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - graphviz
@@ -44,7 +43,7 @@ dependencies:
 - pytest
 - pytest-cov
 - recommonmark
-- rmm==23.4.*
+- rmm=23.06
 - scikit-build>=0.13.1
 - scikit-learn
 - scipy
@@ -52,6 +51,6 @@ dependencies:
 - sphinx-markdown-tables
 - sysroot_linux-64==2.17
 - ucx-proc=*=gpu
-- ucx-py==0.31.*
+- ucx-py=0.32.*
 - ucx>=1.13.0
 name: all_cuda-118_arch-x86_64
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index fc7014fa0c..dbb2700dcc 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -10,8 +10,8 @@
 # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 # or implied. See the License for the specific language governing permissions and limitations under
 # the License.
-set(RAPIDS_VERSION "23.04")
-set(RAFT_VERSION "23.04.00")
+set(RAPIDS_VERSION "23.06")
+set(RAFT_VERSION "23.06.00")
 
 include(FetchContent)
 FetchContent_Declare(
diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
index 2a92c67996..17a1e0caca 100644
--- a/cpp/doxygen/Doxyfile
+++ b/cpp/doxygen/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME           = "RAFT C++ API"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = "23.04"
+PROJECT_NUMBER         = "23.06"
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/dependencies.yaml b/dependencies.yaml
index af29bf68a8..7254d12cb3 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -276,13 +276,15 @@ dependencies:
         packages:
           - dask-core==2023.3.2
           - ucx>=1.13.0
+          - ucx-py=0.32.*
           - ucx-proc=*=gpu
-      - output_types: pyproject
-        packages:
-          - pylibraft==23.4.*
-  test_python_common:
+          - rmm=23.06
+          - libfaiss>=1.7.1=cuda*
+          - faiss-proc=*=cuda
+          - dask-cuda=23.06
+  test_python:
     common:
-      - output_types: [conda, requirements, pyproject]
+      - output_types: [conda, requirements]
         packages:
           - pytest
           - pytest-cov
diff --git a/docs/source/build.md b/docs/source/build.md
index 9c0c61b544..d7550eb631 100644
--- a/docs/source/build.md
+++ b/docs/source/build.md
@@ -254,7 +254,7 @@ While not a highly suggested method for building against RAFT, when all of the n
 set(RAFT_GIT_DIR ${CMAKE_CURRENT_BINARY_DIR}/raft CACHE STRING "Path to RAFT repo")
 ExternalProject_Add(raft
   GIT_REPOSITORY    git@github.com:rapidsai/raft.git
-  GIT_TAG           branch-23.04
+  GIT_TAG           branch-23.06
   PREFIX            ${RAFT_GIT_DIR}
   CONFIGURE_COMMAND ""
   BUILD_COMMAND     ""
@@ -292,7 +292,7 @@ The following `cmake` snippet enables a flexible configuration of RAFT:
 
 ```cmake
 
-set(RAFT_VERSION "23.04")
+set(RAFT_VERSION "23.06")
 set(RAFT_FORK "rapidsai")
 set(RAFT_PINNED_TAG "branch-${RAFT_VERSION}")
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 33a8a9217a..f9054420ca 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -67,9 +67,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '23.04'
+version = '23.06'
 # The full version, including alpha/beta/rc tags.
-release = '23.04.00'
+release = '23.06.00'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index 56100b38f7..6f57453e28 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -140,13 +140,13 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour
 1. Do not split empty functions/records/namespaces.
 2. Two-space indentation everywhere, including the line continuations.
 3. Disable reflowing of comments.
-   The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-23.04/cpp/.clang-format).
+   The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-23.06/cpp/.clang-format).
 
 #### How is the check done?
-All formatting checks are done by this python script: [run-clang-format.py](https://github.com/rapidsai/raft/blob/branch-23.04/cpp/scripts/run-clang-format.py) which is effectively a wrapper over `clang-format`. An error is raised if the code diverges from the format suggested by clang-format. It is expected that the developers run this script to detect and fix formatting violations before creating PR.
+All formatting checks are done by this python script: [run-clang-format.py](https://github.com/rapidsai/raft/blob/branch-23.06/cpp/scripts/run-clang-format.py) which is effectively a wrapper over `clang-format`. An error is raised if the code diverges from the format suggested by clang-format. It is expected that the developers run this script to detect and fix formatting violations before creating PR.
 
 ##### As part of CI
-[run-clang-format.py](https://github.com/rapidsai/raft/blob/branch-23.04/cpp/scripts/run-clang-format.py) is executed as part of our `ci/checks/style.sh` CI test. If there are any formatting violations, PR author is expected to fix those to get CI passing. Steps needed to fix the formatting violations are described in the subsequent sub-section.
+[run-clang-format.py](https://github.com/rapidsai/raft/blob/branch-23.06/cpp/scripts/run-clang-format.py) is executed as part of our `ci/checks/style.sh` CI test. If there are any formatting violations, PR author is expected to fix those to get CI passing. Steps needed to fix the formatting violations are described in the subsequent sub-section.
 
 ##### Manually
 Developers can also manually (or setup this command as part of git pre-commit hook) run this check by executing:
@@ -156,10 +156,10 @@ python ./cpp/scripts/run-clang-format.py
 From the root of the RAFT repository.
 
 #### How to know the formatting violations?
-When there are formatting errors, [run-clang-format.py](https://github.com/rapidsai/raft/blob/branch-23.04/cpp/scripts/run-clang-format.py) prints a `diff` command, showing where there are formatting differences. Unfortunately, unlike `flake8`, `clang-format` does NOT print descriptions of the violations, but instead directly formats the code. So, the only way currently to know about formatting differences is to run the diff command as suggested by this script against each violating source file.
+When there are formatting errors, [run-clang-format.py](https://github.com/rapidsai/raft/blob/branch-23.06/cpp/scripts/run-clang-format.py) prints a `diff` command, showing where there are formatting differences. Unfortunately, unlike `flake8`, `clang-format` does NOT print descriptions of the violations, but instead directly formats the code. So, the only way currently to know about formatting differences is to run the diff command as suggested by this script against each violating source file.
 
 #### How to fix the formatting violations?
-When there are formatting violations, [run-clang-format.py](https://github.com/rapidsai/raft/blob/branch-23.04/cpp/scripts/run-clang-format.py) prints at the end, the exact command that can be run by developers to fix them. This is the easiest way to fix formatting errors. [This screencast](https://asciinema.org/a/287367) shows how developers can check for formatting violations in their branches and also how to fix those, before sending out PRs.
+When there are formatting violations, [run-clang-format.py](https://github.com/rapidsai/raft/blob/branch-23.06/cpp/scripts/run-clang-format.py) prints at the end, the exact command that can be run by developers to fix them. This is the easiest way to fix formatting errors. [This screencast](https://asciinema.org/a/287367) shows how developers can check for formatting violations in their branches and also how to fix those, before sending out PRs.
 
 In short, to bulk-fix all the formatting violations, execute the following command:
 ```bash
@@ -168,13 +168,13 @@ python ./cpp/scripts/run-clang-format.py -inplace
 From the root of the RAFT repository.
 
 #### clang-format version?
-To avoid spurious code style violations we specify the exact clang-format version required, currently `11.1.0`. This is enforced by the [run-clang-format.py](https://github.com/rapidsai/raft/blob/branch-23.04/cpp/scripts/run-clang-format.py) script itself. Refer [here](../build#build-dependencies) for the list of build-time dependencies.
+To avoid spurious code style violations we specify the exact clang-format version required, currently `11.1.0`. This is enforced by the [run-clang-format.py](https://github.com/rapidsai/raft/blob/branch-23.06/cpp/scripts/run-clang-format.py) script itself. Refer [here](../build#build-dependencies) for the list of build-time dependencies.
 
 #### Additional scripts
 Along with clang, there are an include checker and copyright checker scripts for checking style, which can be performed as part of CI, as well as manually.
 
 ##### #include style
-[include_checker.py](https://github.com/rapidsai/raft/blob/branch-23.04/cpp/scripts/include_checker.py) is used to enforce the include style as follows:
+[include_checker.py](https://github.com/rapidsai/raft/blob/branch-23.06/cpp/scripts/include_checker.py) is used to enforce the include style as follows:
 1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies.
 2. `#include <...>` should be used for referencing everything else
 
@@ -184,7 +184,7 @@ python ./cpp/scripts/include_checker.py --inplace [cpp/include cpp/test ... list
 ```
 
 ##### Copyright header
-[copyright.py](https://github.com/rapidsai/raft/blob/branch-23.04/ci/checks/copyright.py) checks the Copyright header for all git-modified files
+[copyright.py](https://github.com/rapidsai/raft/blob/branch-23.06/ci/checks/copyright.py) checks the Copyright header for all git-modified files
 
 Manually, you can run the following to bulk-fix the header if only the years need to be updated:
 ```bash
@@ -198,7 +198,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY`
 ## Logging
 
 ### Introduction
-Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-23.04/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all.
+Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-23.06/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all.
 
 ### Usage
 ```cpp
diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake
index ae4fb329cc..baead41cca 100644
--- a/fetch_rapids.cmake
+++ b/fetch_rapids.cmake
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake)
-  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.04/RAPIDS.cmake
+  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.06/RAPIDS.cmake
        ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake
   )
 endif()
diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt
index 3c2b093362..349a2b08ba 100644
--- a/python/pylibraft/CMakeLists.txt
+++ b/python/pylibraft/CMakeLists.txt
@@ -14,7 +14,7 @@
 
 cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 
-set(pylibraft_version 23.04.00)
+set(pylibraft_version 23.06.00)
 
 include(../../fetch_rapids.cmake)
 
diff --git a/python/pylibraft/pylibraft/__init__.py b/python/pylibraft/pylibraft/__init__.py
index 39145085f0..aebaa4e272 100644
--- a/python/pylibraft/pylibraft/__init__.py
+++ b/python/pylibraft/pylibraft/__init__.py
@@ -13,4 +13,4 @@
 # limitations under the License.
 #
 
-__version__ = "23.04.00"
+__version__ = "23.06.00"
diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index fed15bbab0..b4eb296089 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -19,16 +19,13 @@ requires = [
     "cuda-python >=11.7.1,<12.0",
     "cython>=0.29,<0.30",
     "ninja",
-    "rmm==23.4.*",
-    "scikit-build>=0.13.1",
-    "setuptools",
-    "wheel",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+    "rmm==23.6.*",
+]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "pylibraft"
-version = "23.04.00"
+version = "23.06.00"
 description = "RAFT: Reusable Algorithms Functions and other Tools"
 readme = { file = "README.md", content-type = "text/markdown" }
 authors = [
@@ -37,10 +34,10 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.8"
 dependencies = [
-    "cuda-python >=11.7.1,<12.0",
-    "numpy>=1.21",
-    "rmm==23.4.*",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+    "numpy",
+    "cuda-python>=11.7.1,<12.0",
+    "rmm==23.6.*",
+]
 classifiers = [
     "Intended Audience :: Developers",
     "Programming Language :: Python",
diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt
index 49e7f50c27..b157abf309 100644
--- a/python/raft-dask/CMakeLists.txt
+++ b/python/raft-dask/CMakeLists.txt
@@ -14,7 +14,7 @@
 
 cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 
-set(raft_dask_version 23.04.00)
+set(raft_dask_version 23.06.00)
 
 include(../../fetch_rapids.cmake)
 
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index ba6cd7ccae..28938e1590 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -25,7 +25,7 @@ requires = [
 
 [project]
 name = "raft-dask"
-version = "23.04.00"
+version = "23.06.00"
 description = "Reusable Accelerated Functions & Tools Dask Infrastructure"
 readme = { file = "README.md", content-type = "text/markdown" }
 authors = [
@@ -34,15 +34,15 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.8"
 dependencies = [
-    "dask-cuda==23.4.*",
-    "dask==2023.3.2",
-    "distributed==2023.3.2.1",
-    "joblib>=0.11",
+    "numpy",
     "numba>=0.49",
-    "numpy>=1.21",
-    "pylibraft==23.4.*",
-    "ucx-py==0.31.*",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+    "joblib>=0.11",
+    "dask-cuda==23.6.*",
+    "dask>=2023.1.1",
+    "ucx-py==0.32.*",
+    "distributed>=2023.1.1",
+    "pylibraft==23.6.*",
+]
 classifiers = [
     "Intended Audience :: Developers",
     "Programming Language :: Python",
diff --git a/python/raft-dask/raft_dask/__init__.py b/python/raft-dask/raft_dask/__init__.py
index 4f4700df48..9582da4851 100644
--- a/python/raft-dask/raft_dask/__init__.py
+++ b/python/raft-dask/raft_dask/__init__.py
@@ -13,4 +13,4 @@
 # limitations under the License.
 #
 
-__version__ = "23.04.00"
+__version__ = "23.06.00"

From e477dbeb6e2525aa9a0652ac07c890f7e2315cac Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyas.ramasubramani@gmail.com>
Date: Wed, 29 Mar 2023 21:17:48 -0400
Subject: [PATCH 20/89] Update rapids version

---
 cpp/template/cmake/thirdparty/fetch_rapids.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/template/cmake/thirdparty/fetch_rapids.cmake b/cpp/template/cmake/thirdparty/fetch_rapids.cmake
index 40ba83be9e..248f4f1af4 100644
--- a/cpp/template/cmake/thirdparty/fetch_rapids.cmake
+++ b/cpp/template/cmake/thirdparty/fetch_rapids.cmake
@@ -12,7 +12,7 @@
 # the License.
 
 # Use this variable to update RAPIDS and RAFT versions
-set(RAPIDS_VERSION "23.04")
+set(RAPIDS_VERSION "23.06")
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake)
     file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake

From dd5cdd3c90ad77013caffe2b147756e585c27960 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyas.ramasubramani@gmail.com>
Date: Wed, 29 Mar 2023 21:38:40 -0400
Subject: [PATCH 21/89] Update pylibraft version

---
 dependencies.yaml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 7254d12cb3..66b1c31b2b 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -278,11 +278,10 @@ dependencies:
           - ucx>=1.13.0
           - ucx-py=0.32.*
           - ucx-proc=*=gpu
-          - rmm=23.06
-          - libfaiss>=1.7.1=cuda*
-          - faiss-proc=*=cuda
-          - dask-cuda=23.06
-  test_python:
+      - output_types: pyproject
+        packages:
+          - pylibraft==23.6.*
+  test_python_common:
     common:
       - output_types: [conda, requirements]
         packages:

From d8c85e7d27bc0784bc346e0ccdaa551bf066f216 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyas.ramasubramani@gmail.com>
Date: Wed, 29 Mar 2023 21:48:31 -0400
Subject: [PATCH 22/89] Run dfg

---
 python/raft-dask/pyproject.toml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 28938e1590..597f1a8764 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -39,6 +39,11 @@ dependencies = [
     "joblib>=0.11",
     "dask-cuda==23.6.*",
     "dask>=2023.1.1",
+    "distributed>=2023.1.1",
+    "joblib>=0.11",
+    "numba>=0.49",
+    "numpy>=1.21",
+    "pylibraft==23.6.*",
     "ucx-py==0.32.*",
     "distributed>=2023.1.1",
     "pylibraft==23.6.*",

From bdcbfcf00d60b66900c950b3c138dc264d8fdc4c Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 30 Mar 2023 09:40:42 -0400
Subject: [PATCH 23/89] Fix dask versions in wheel build preinstallation

---
 .github/workflows/pr.yaml   | 4 ++--
 .github/workflows/test.yaml | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 6159b5c8ab..d0809e66b6 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -105,7 +105,7 @@ jobs:
       build_type: pull-request
       package-name: raft_dask
       # Always want to test against latest dask/distributed.
-      test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
-      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
+      test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
+      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
       test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"
       test-smoketest: "python ./ci/wheel_smoke_test_raft_dask.py"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 11ff3333d1..e0731a9a97 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -51,6 +51,6 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       package-name: raft_dask
-      test-before-amd64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
-      test-before-arm64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
+      test-before-amd64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
+      test-before-arm64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
       test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"

From 30b777cccdccc3db2ab177d216d587627f4cf24d Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 31 Mar 2023 19:12:15 -0400
Subject: [PATCH 24/89] Fix ucx-py pin in raft-dask recipe (#1396)

Update the ucx-py pinning for raft-dask 23.06

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Mark Sadang (https://github.com/msadang)

URL: https://github.com/rapidsai/raft/pull/1396
---
 conda/recipes/raft-dask/conda_build_config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml
index 778b187870..4f88728f4b 100644
--- a/conda/recipes/raft-dask/conda_build_config.yaml
+++ b/conda/recipes/raft-dask/conda_build_config.yaml
@@ -14,7 +14,7 @@ ucx_version:
   - ">=1.13.0,<1.15.0"
 
 ucx_py_version:
-  - "0.31.*"
+  - "0.32.*"
 
 cmake_version:
   - ">=3.23.1,!=3.25.0"

From 6ad9d957e65abb384a6107212dce0751af7f1535 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 7 Apr 2023 08:22:33 -0400
Subject: [PATCH 25/89] Have consistent compile lines between BUILD_TESTS
 enabled or not (#1401)

This will remove 1h from our conda CI builds since we can now re-use the cached object files between `libraft` and `libraft-tests`

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Ben Frederickson (https://github.com/benfred)
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/raft/pull/1401
---
 cpp/CMakeLists.txt | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index dbb2700dcc..c7d3c89a81 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -81,15 +81,12 @@ option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations"
        ${RAFT_COMPILE_LIBRARY_DEFAULT}
 )
 
-if(BUILD_TESTS
-   OR BUILD_PRIMS_BENCH
-   OR BUILD_ANN_BENCH
-)
-  # Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs
-  # to have different values for the `Threads::Threads` target. Setting this flag ensures
-  # `Threads::Threads` is the same value in first run and subsequent runs.
-  set(THREADS_PREFER_PTHREAD_FLAG ON)
-endif()
+
+# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs
+# to have different values for the `Threads::Threads` target. Setting this flag ensures
+# `Threads::Threads` is the same value across all builds so that cache hits occur
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+
 
 include(CMakeDependentOption)
 # cmake_dependent_option( RAFT_USE_FAISS_STATIC "Build and statically link the FAISS library for

From c09993b5434254be2f12fea29f3a7e7021cc1db9 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 10 Apr 2023 11:08:35 -0700
Subject: [PATCH 26/89] Remove uses-setup-env-vars (#1406)

This setting now matches the default behavior of the shared-action-workflows repo

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)

URL: https://github.com/rapidsai/raft/pull/1406
---
 .github/workflows/build.yaml | 2 --
 .github/workflows/pr.yaml    | 2 --
 2 files changed, 4 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 3c8cc4912d..bec89ab888 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -75,7 +75,6 @@ jobs:
       package-name: pylibraft
       package-dir: python/pylibraft
       skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
-      uses-setup-env-vars: false
   wheel-publish-pylibraft:
     needs: wheel-build-pylibraft
     secrets: inherit
@@ -98,7 +97,6 @@ jobs:
       package-name: raft_dask
       package-dir: python/raft-dask
       skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
-      uses-setup-env-vars: false
   wheel-publish-raft-dask:
     needs: wheel-build-raft-dask
     secrets: inherit
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index d0809e66b6..2085f89414 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -73,7 +73,6 @@ jobs:
       package-name: pylibraft
       package-dir: python/pylibraft
       skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
-      uses-setup-env-vars: false
   wheel-tests-pylibraft:
     needs: wheel-build-pylibraft
     secrets: inherit
@@ -96,7 +95,6 @@ jobs:
       package-dir: python/raft-dask
       before-wheel: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-wheelhouse"
       skbuild-configure-options: "-DRAFT_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
-      uses-setup-env-vars: false
   wheel-tests-raft-dask:
     needs: wheel-build-raft-dask
     secrets: inherit

From 8bd64a0c34064c2ae0e09deeef8e89a6fa04092b Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Wed, 12 Apr 2023 19:01:46 +0200
Subject: [PATCH 27/89] Split ivf-pq, fused l2 nn

---
 cpp/CMakeLists.txt                            |  85 +-
 cpp/include/raft/distance/fused_l2_nn-ext.cuh | 117 +++
 cpp/include/raft/distance/fused_l2_nn-inl.cuh |  27 +-
 cpp/include/raft/distance/fused_l2_nn.cuh     |  25 +
 .../specializations/fused_l2_nn_min.cuh       | 127 ---
 .../detail/ivf_pq_compute_similarity-ext.cuh  | 179 ++++
 .../detail/ivf_pq_compute_similarity-inl.cuh  | 803 ++++++++++++++++
 .../detail/ivf_pq_compute_similarity.cuh      |  25 +
 .../detail/ivf_pq_dummy_block_sort.cuh        |  39 +
 .../raft/neighbors/detail/ivf_pq_fp_8bit.cuh  | 113 +++
 .../raft/neighbors/detail/ivf_pq_search.cuh   | 864 +-----------------
 cpp/include/raft/neighbors/ivf_pq-ext.cuh     | 366 ++++++++
 cpp/include/raft/neighbors/ivf_pq.cuh         |  25 +
 .../raft/neighbors/specializations.cuh        |   1 -
 .../detail/ivf_pq_compute_similarity.cuh      |  50 -
 .../raft/neighbors/specializations/ivf_pq.cuh |  77 --
 cpp/include/raft/util/detail/cub_wrappers.cuh |   4 +-
 .../raft/util/inline.hpp}                     |  28 +-
 cpp/src/distance/fused_l2_nn.cu               |  54 ++
 .../detail/00_write_template.py               | 159 ----
 .../canberra_double_double_double_int.cu      |  33 -
 .../detail/canberra_float_float_float_int.cu  |  33 -
 .../correlation_double_double_double_int.cu   |  33 -
 .../correlation_float_float_float_int.cu      |  33 -
 .../detail/cosine_double_double_double_int.cu |  34 -
 .../detail/cosine_float_float_float_int.cu    |  34 -
 ...ing_unexpanded_double_double_double_int.cu |  33 -
 ...amming_unexpanded_float_float_float_int.cu |  33 -
 ...inger_expanded_double_double_double_int.cu |  33 -
 ...ellinger_expanded_float_float_float_int.cu |  33 -
 .../inner_product_double_double_double_int.cu |  38 -
 ...jensen_shannon_double_double_double_int.cu |  34 -
 .../jensen_shannon_float_float_float_int.cu   |  34 -
 .../kl_divergence_double_double_double_int.cu |  33 -
 .../kl_divergence_float_float_float_int.cu    |  33 -
 .../detail/l1_double_double_double_int.cu     |  33 -
 .../detail/l1_float_float_float_int.cu        |  33 -
 .../l2_expanded_double_double_double_int.cu   |  34 -
 .../l2_expanded_float_float_float_int.cu      |  34 -
 .../l2_unexpanded_double_double_double_int.cu |  33 -
 .../l2_unexpanded_float_float_float_int.cu    |  33 -
 .../detail/l_inf_double_double_double_int.cu  |  33 -
 .../detail/l_inf_float_float_float_int.cu     |  33 -
 .../lp_unexpanded_double_double_double_int.cu |  33 -
 .../lp_unexpanded_float_float_float_int.cu    |  33 -
 .../russel_rao_double_double_double_int.cu    |  33 -
 .../russel_rao_float_float_float_int.cu       |  33 -
 .../specializations/fused_l2_nn_double_int.cu |  51 --
 .../fused_l2_nn_double_int64.cu               |  51 --
 .../specializations/fused_l2_nn_float_int.cu  |  51 --
 .../fused_l2_nn_float_int64.cu                |  51 --
 .../ivf_pq_compute_similarity_00_generate.py  |  61 ++
 .../ivf_pq_compute_similarity_float_float.cu  |  40 +
 ...f_pq_compute_similarity_float_fp8_false.cu |  41 +
 ...vf_pq_compute_similarity_float_fp8_true.cu |  41 +
 .../ivf_pq_compute_similarity_float_half.cu   |  40 +
 ...vf_pq_compute_similarity_half_fp8_false.cu |  41 +
 ...ivf_pq_compute_similarity_half_fp8_true.cu |  41 +
 .../ivf_pq_compute_similarity_half_half.cu    |  40 +
 .../neighbors/ivfpq_build_float_int64_t.cu    |  36 +
 .../neighbors/ivfpq_build_int8_t_int64_t.cu   |  36 +
 .../neighbors/ivfpq_build_uint8_t_int64_t.cu  |  36 +
 cpp/src/neighbors/ivfpq_deserialize.cu        |   4 +-
 .../neighbors/ivfpq_extend_float_int64_t.cu   |  50 +
 .../neighbors/ivfpq_extend_int8_t_int64_t.cu  |  50 +
 .../neighbors/ivfpq_extend_uint8_t_int64_t.cu |  50 +
 .../neighbors/ivfpq_search_float_int64_t.cu   |  43 +-
 .../neighbors/ivfpq_search_int8_t_int64_t.cu  |  43 +-
 .../neighbors/ivfpq_search_uint8_t_int64_t.cu |  43 +-
 cpp/src/neighbors/ivfpq_serialize.cu          |   3 +-
 70 files changed, 2467 insertions(+), 2546 deletions(-)
 create mode 100644 cpp/include/raft/distance/fused_l2_nn-ext.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/ivf_pq_dummy_block_sort.cuh
 create mode 100644 cpp/include/raft/neighbors/detail/ivf_pq_fp_8bit.cuh
 create mode 100644 cpp/include/raft/neighbors/ivf_pq-ext.cuh
 delete mode 100644 cpp/include/raft/neighbors/specializations/detail/ivf_pq_compute_similarity.cuh
 delete mode 100644 cpp/include/raft/neighbors/specializations/ivf_pq.cuh
 rename cpp/{src/distance/specializations/detail/inner_product_float_float_float_int.cu => include/raft/util/inline.hpp} (50%)
 create mode 100644 cpp/src/distance/fused_l2_nn.cu
 delete mode 100644 cpp/src/distance/specializations/detail/00_write_template.py
 delete mode 100644 cpp/src/distance/specializations/detail/canberra_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/canberra_float_float_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/correlation_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/correlation_float_float_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/cosine_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/cosine_float_float_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/inner_product_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/kl_divergence_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/kl_divergence_float_float_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/l1_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/l1_float_float_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/l2_expanded_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/l2_expanded_float_float_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/l_inf_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/l_inf_float_float_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/russel_rao_double_double_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/russel_rao_float_float_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/fused_l2_nn_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/fused_l2_nn_double_int64.cu
 delete mode 100644 cpp/src/distance/specializations/fused_l2_nn_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/fused_l2_nn_float_int64.cu
 create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py
 create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
 create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
 create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
 create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
 create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
 create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
 create mode 100644 cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
 create mode 100644 cpp/src/neighbors/ivfpq_build_float_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivfpq_build_int8_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivfpq_build_uint8_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivfpq_extend_float_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivfpq_extend_int8_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivfpq_extend_uint8_t_int64_t.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c7d3c89a81..e3486062d5 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -81,13 +81,11 @@ option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations"
        ${RAFT_COMPILE_LIBRARY_DEFAULT}
 )
 
-
-# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs
-# to have different values for the `Threads::Threads` target. Setting this flag ensures
+# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to
+# have different values for the `Threads::Threads` target. Setting this flag ensures
 # `Threads::Threads` is the same value across all builds so that cache hits occur
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 
-
 include(CMakeDependentOption)
 # cmake_dependent_option( RAFT_USE_FAISS_STATIC "Build and statically link the FAISS library for
 # nearest neighbors search on GPU" ON RAFT_COMPILE_LIBRARY OFF )
@@ -291,6 +289,13 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/refine_h_int64_t_float.cu
     src/neighbors/refine_h_int64_t_int8_t.cu
     src/neighbors/refine_h_int64_t_uint8_t.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
     src/neighbors/detail/selection_faiss.cu
     src/neighbors/specializations/refine_d_int64_t_float.cu
     src/neighbors/specializations/refine_d_int64_t_int8_t.cu
@@ -329,6 +334,7 @@ if(RAFT_COMPILE_LIBRARY)
     src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu
     src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
+    src/distance/fused_l2_nn.cu
     src/distance/specializations/detail/kernels/gram_matrix_base_double.cu
     src/distance/specializations/detail/kernels/gram_matrix_base_float.cu
     src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
@@ -339,10 +345,6 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/brute_force_knn_int64_t_float.cu
     src/distance/specializations/detail/kernels/tanh_kernel_double.cu
     src/distance/specializations/detail/kernels/tanh_kernel_float.cu
-    src/distance/specializations/fused_l2_nn_double_int.cu
-    src/distance/specializations/fused_l2_nn_double_int64.cu
-    src/distance/specializations/fused_l2_nn_float_int.cu
-    src/distance/specializations/fused_l2_nn_float_int64.cu
     src/matrix/detail/select_k_double_int64_t.cu
     src/matrix/detail/select_k_double_uint32_t.cu
     src/matrix/detail/select_k_float_int64_t.cu
@@ -355,40 +357,10 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/ivfpq_search_float_int64_t.cu
     src/neighbors/ivfpq_search_int8_t_int64_t.cu
     src/neighbors/ivfpq_search_uint8_t_int64_t.cu
-    src/neighbors/specializations/ivfpq_build_float_int64_t.cu
-    src/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu
-    src/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu
-    src/neighbors/specializations/ivfpq_extend_float_int64_t.cu
-    src/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu
-    src/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu
-    src/neighbors/specializations/ivfpq_search_float_int64_t.cu
-    src/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu
-    src/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu
     src/neighbors/specializations/detail/brute_force_knn_impl_long_float_int.cu
     src/neighbors/specializations/detail/brute_force_knn_impl_long_float_uint.cu
     src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_int.cu
     src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_uint.cu
-    src/neighbors/specializations/detail/compute_similarity_float_float_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu
-    src/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu
-    src/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu
-    src/neighbors/specializations/detail/compute_similarity_float_half_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu
-    src/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu
-    src/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu
-    src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu
     src/random/rmat_rectangular_generator_int_double.cu
     src/random/rmat_rectangular_generator_int64_double.cu
     src/random/rmat_rectangular_generator_int_float.cu
@@ -409,41 +381,16 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu
     src/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu
     src/neighbors/ivfpq_build.cu
+    src/neighbors/ivfpq_build_float_int64_t.cu
+    src/neighbors/ivfpq_build_int8_t_int64_t.cu
+    src/neighbors/ivfpq_build_uint8_t_int64_t.cu
     src/neighbors/ivfpq_deserialize.cu
-    src/neighbors/ivfpq_serialize.cu
+    src/neighbors/ivfpq_extend_float_int64_t.cu
+    src/neighbors/ivfpq_extend_int8_t_int64_t.cu
     src/neighbors/ivfpq_search_float_int64_t.cu
     src/neighbors/ivfpq_search_int8_t_int64_t.cu
     src/neighbors/ivfpq_search_uint8_t_int64_t.cu
-    src/neighbors/specializations/ivfpq_build_float_int64_t.cu
-    src/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu
-    src/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu
-    src/neighbors/specializations/ivfpq_extend_float_int64_t.cu
-    src/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu
-    src/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu
-    src/neighbors/specializations/ivfpq_search_float_int64_t.cu
-    src/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu
-    src/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu
-    src/neighbors/specializations/detail/compute_similarity_float_float_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu
-    src/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu
-    src/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu
-    src/neighbors/specializations/detail/compute_similarity_float_half_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu
-    src/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu
-    src/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu
-    src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu
-    src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu
-    src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu
+    src/neighbors/ivfpq_serialize.cu
     src/random/rmat_rectangular_generator_int_double.cu
     src/random/rmat_rectangular_generator_int64_double.cu
     src/random/rmat_rectangular_generator_int_float.cu
diff --git a/cpp/include/raft/distance/fused_l2_nn-ext.cuh b/cpp/include/raft/distance/fused_l2_nn-ext.cuh
new file mode 100644
index 0000000000..b9b507179b
--- /dev/null
+++ b/cpp/include/raft/distance/fused_l2_nn-ext.cuh
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>  // int64_t
+#include <raft/core/kvp.hpp>
+#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE
+
+namespace raft {
+namespace distance {
+/**
+ * \defgroup fused_l2_nn Fused 1-nearest neighbors
+ * @{
+ * @}
+ */
+
+/**
+ * @brief Wrapper around fusedL2NN with minimum reduction operators.
+ *
+ * fusedL2NN cannot be compiled in the distance library due to the lambda
+ * operators, so this wrapper covers the most common case (minimum).
+ * This should be preferred to the more generic API when possible, in order to
+ * reduce compilation times for users of the shared library.
+ *
+ * @tparam DataT     data type
+ * @tparam OutT      output type to either store 1-NN indices and their minimum
+ *                   distances (e.g. raft::KeyValuePair<int, float>) or store only
+ *                   the min distances.
+ * @tparam IdxT      indexing arithmetic type
+ * @param[out] min           will contain the reduced output (Length = `m`)
+ *                           (on device)
+ * @param[in]  x             first matrix. Row major. Dim = `m x k`.
+ *                           (on device).
+ * @param[in]  y             second matrix. Row major. Dim = `n x k`.
+ *                           (on device).
+ * @param[in]  xn            L2 squared norm of `x`. Length = `m`. (on device).
+ * @param[in]  yn            L2 squared norm of `y`. Length = `n`. (on device)
+ * @param[in]  m             gemm m
+ * @param[in]  n             gemm n
+ * @param[in]  k             gemm k
+ * @param[in]  workspace     temp workspace. Size = sizeof(int)*m. (on device)
+ * @param[in]  sqrt          Whether the output `min` should contain L2-sqrt
+ * @param[in]  initOutBuffer whether to initialize the output buffer before the
+ *                           main kernel launch
+ * @param[in]  stream        cuda stream
+ */
+template <typename DataT, typename OutT, typename IdxT>
+void fusedL2NNMinReduce(OutT* min,
+                        const DataT* x,
+                        const DataT* y,
+                        const DataT* xn,
+                        const DataT* yn,
+                        IdxT m,
+                        IdxT n,
+                        IdxT k,
+                        void* workspace,
+                        bool sqrt,
+                        bool initOutBuffer,
+                        cudaStream_t stream) RAFT_EXPLICIT;
+
+/** @} */
+
+}  // namespace distance
+}  // namespace raft
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+
+#define instantiate_raft_distance_fusedL2NNMinReduce(DataT, OutT, IdxT)                          \
+  extern template void raft::distance::fusedL2NNMinReduce<DataT, OutT, IdxT>(OutT * min,         \
+                                                                             const DataT* x,     \
+                                                                             const DataT* y,     \
+                                                                             const DataT* xn,    \
+                                                                             const DataT* yn,    \
+                                                                             IdxT m,             \
+                                                                             IdxT n,             \
+                                                                             IdxT k,             \
+                                                                             void* workspace,    \
+                                                                             bool sqrt,          \
+                                                                             bool initOutBuffer, \
+                                                                             cudaStream_t stream)
+
+instantiate_raft_distance_fusedL2NNMinReduce(double, double, int);
+instantiate_raft_distance_fusedL2NNMinReduce(double, double, int64_t);
+instantiate_raft_distance_fusedL2NNMinReduce(float, float, int);
+instantiate_raft_distance_fusedL2NNMinReduce(float, float, int64_t);
+
+// We can't have commas inside a macro argument, so we use the COMMA macro:
+#define COMMA ,
+
+instantiate_raft_distance_fusedL2NNMinReduce(double, raft::KeyValuePair<int COMMA double>, int);
+instantiate_raft_distance_fusedL2NNMinReduce(double,
+                                             raft::KeyValuePair<int64_t COMMA double>,
+                                             int64_t);
+instantiate_raft_distance_fusedL2NNMinReduce(float, raft::KeyValuePair<int COMMA float>, int);
+instantiate_raft_distance_fusedL2NNMinReduce(float,
+                                             raft::KeyValuePair<int64_t COMMA float>,
+                                             int64_t);
+
+#undef COMMA
+
+#undef instantiate_raft_distance_fusedL2NNMinReduce
diff --git a/cpp/include/raft/distance/fused_l2_nn-inl.cuh b/cpp/include/raft/distance/fused_l2_nn-inl.cuh
index e832bcb020..05a994b3c7 100644
--- a/cpp/include/raft/distance/fused_l2_nn-inl.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn-inl.cuh
@@ -14,9 +14,6 @@
  * limitations under the License.
  */
 
-#ifndef __FUSED_L2_NN_H
-#define __FUSED_L2_NN_H
-
 #pragma once
 
 #include <cub/cub.cuh>
@@ -33,19 +30,9 @@ namespace distance {
 /**
  * \defgroup fused_l2_nn Fused 1-nearest neighbors
  * @{
+ * @}
  */
 
-template <typename LabelT, typename DataT>
-using KVPMinReduce = detail::KVPMinReduceImpl<LabelT, DataT>;
-
-template <typename LabelT, typename DataT>
-using MinAndDistanceReduceOp = detail::MinAndDistanceReduceOpImpl<LabelT, DataT>;
-
-template <typename LabelT, typename DataT>
-using MinReduceOp = detail::MinReduceOpImpl<LabelT, DataT>;
-
-/** @} */
-
 /**
  * Initialize array using init value from reduction op
  */
@@ -215,8 +202,14 @@ void fusedL2NNMinReduce(OutT* min,
                         bool initOutBuffer,
                         cudaStream_t stream)
 {
-  MinAndDistanceReduceOp<IdxT, DataT> redOp;
-  KVPMinReduce<IdxT, DataT> pairRedOp;
+  // detail::MinAndDistanceReduceOpImpl<IdxT, DataT> redOp;
+  // detail::KVPMinReduceImpl<IdxT, DataT> pairRedOp;
+
+  detail::MinAndDistanceReduceOpImpl<IdxT, DataT> redOp;
+  detail::KVPMinReduceImpl<IdxT, DataT> pairRedOp;
+
+  // MinAndDistanceReduceOp<IdxT, DataT> redOp;
+  // KVPMinReduce<IdxT, DataT> pairRedOp;
 
   fusedL2NN<DataT, OutT, IdxT>(
     min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
@@ -226,5 +219,3 @@ void fusedL2NNMinReduce(OutT* min,
 
 }  // namespace distance
 }  // namespace raft
-
-#endif
diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh
index e69de29bb2..9501602353 100644
--- a/cpp/include/raft/distance/fused_l2_nn.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn.cuh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "fused_l2_nn-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "fused_l2_nn-ext.cuh"
+#endif
diff --git a/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh b/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh
deleted file mode 100644
index 88e1216635..0000000000
--- a/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/core/kvp.hpp>
-#include <raft/distance/fused_l2_nn.cuh>
-
-namespace raft {
-namespace distance {
-
-extern template void fusedL2NNMinReduce<float, raft::KeyValuePair<int, float>, int>(
-  raft::KeyValuePair<int, float>* min,
-  const float* x,
-  const float* y,
-  const float* xn,
-  const float* yn,
-  int m,
-  int n,
-  int k,
-  void* workspace,
-  bool sqrt,
-  bool initOutBuffer,
-  cudaStream_t stream);
-extern template void fusedL2NNMinReduce<float, raft::KeyValuePair<int64_t, float>, int64_t>(
-  raft::KeyValuePair<int64_t, float>* min,
-  const float* x,
-  const float* y,
-  const float* xn,
-  const float* yn,
-  int64_t m,
-  int64_t n,
-  int64_t k,
-  void* workspace,
-  bool sqrt,
-  bool initOutBuffer,
-  cudaStream_t stream);
-extern template void fusedL2NNMinReduce<double, raft::KeyValuePair<int, double>, int>(
-  raft::KeyValuePair<int, double>* min,
-  const double* x,
-  const double* y,
-  const double* xn,
-  const double* yn,
-  int m,
-  int n,
-  int k,
-  void* workspace,
-  bool sqrt,
-  bool initOutBuffer,
-  cudaStream_t stream);
-extern template void fusedL2NNMinReduce<double, raft::KeyValuePair<int64_t, double>, int64_t>(
-  raft::KeyValuePair<int64_t, double>* min,
-  const double* x,
-  const double* y,
-  const double* xn,
-  const double* yn,
-  int64_t m,
-  int64_t n,
-  int64_t k,
-  void* workspace,
-  bool sqrt,
-  bool initOutBuffer,
-  cudaStream_t stream);
-extern template void fusedL2NNMinReduce<float, float, int>(float* min,
-                                                           const float* x,
-                                                           const float* y,
-                                                           const float* xn,
-                                                           const float* yn,
-                                                           int m,
-                                                           int n,
-                                                           int k,
-                                                           void* workspace,
-                                                           bool sqrt,
-                                                           bool initOutBuffer,
-                                                           cudaStream_t stream);
-extern template void fusedL2NNMinReduce<float, float, int64_t>(float* min,
-                                                               const float* x,
-                                                               const float* y,
-                                                               const float* xn,
-                                                               const float* yn,
-                                                               int64_t m,
-                                                               int64_t n,
-                                                               int64_t k,
-                                                               void* workspace,
-                                                               bool sqrt,
-                                                               bool initOutBuffer,
-                                                               cudaStream_t stream);
-extern template void fusedL2NNMinReduce<double, double, int>(double* min,
-                                                             const double* x,
-                                                             const double* y,
-                                                             const double* xn,
-                                                             const double* yn,
-                                                             int m,
-                                                             int n,
-                                                             int k,
-                                                             void* workspace,
-                                                             bool sqrt,
-                                                             bool initOutBuffer,
-                                                             cudaStream_t stream);
-extern template void fusedL2NNMinReduce<double, double, int64_t>(double* min,
-                                                                 const double* x,
-                                                                 const double* y,
-                                                                 const double* xn,
-                                                                 const double* yn,
-                                                                 int64_t m,
-                                                                 int64_t n,
-                                                                 int64_t k,
-                                                                 void* workspace,
-                                                                 bool sqrt,
-                                                                 bool initOutBuffer,
-                                                                 cudaStream_t stream);
-
-}  // namespace distance
-}  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
new file mode 100644
index 0000000000..2e15ebd665
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>
+
+#include <raft/core/cudart_utils.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/nvtx.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/map.cuh>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/matrix/detail/select_k.cuh>
+#include <raft/matrix/detail/select_warpsort.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/device_atomics.cuh>
+#include <raft/util/device_loads_stores.cuh>
+#include <raft/util/pow2_utils.cuh>
+#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+#include <raft/util/vectorized.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <cub/cub.cuh>
+
+#include <cuda_fp16.h>
+
+#include <optional>
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE
+
+namespace raft::neighbors::ivf_pq::detail {
+
+// is_local_topk_feasible is not inline here, because we would have to define it
+// here as well. That would run the risk of the definitions here and in the
+// -inl.cuh header diverging.
+auto is_local_topk_feasible(uint32_t k, uint32_t n_probes, uint32_t n_queries) -> bool;
+
+template <typename OutT,
+          typename LutT,
+          uint32_t PqBits,
+          int Capacity,
+          bool PrecompBaseDiff,
+          bool EnableSMemLut>
+__global__ void compute_similarity_kernel(uint32_t n_rows,
+                                          uint32_t dim,
+                                          uint32_t n_probes,
+                                          uint32_t pq_dim,
+                                          uint32_t n_queries,
+                                          distance::DistanceType metric,
+                                          codebook_gen codebook_kind,
+                                          uint32_t topk,
+                                          uint32_t max_samples,
+                                          const float* cluster_centers,
+                                          const float* pq_centers,
+                                          const uint8_t* const* pq_dataset,
+                                          const uint32_t* cluster_labels,
+                                          const uint32_t* _chunk_indices,
+                                          const float* queries,
+                                          const uint32_t* index_list,
+                                          float* query_kths,
+                                          LutT* lut_scores,
+                                          OutT* _out_scores,
+                                          uint32_t* _out_indices) RAFT_EXPLICIT;
+
+// The signature of the kernel defined by a minimal set of template parameters
+template <typename OutT, typename LutT>
+using compute_similarity_kernel_t =
+  decltype(&compute_similarity_kernel<OutT, LutT, 8, 0, true, true>);
+
+template <typename OutT, typename LutT>
+struct occupancy_t {
+  using shmem_unit = Pow2<128>;
+
+  int blocks_per_sm = 0;
+  double occupancy  = 0.0;
+  double shmem_use  = 1.0;
+
+  inline occupancy_t() = default;
+  inline occupancy_t(size_t smem,
+                     uint32_t n_threads,
+                     compute_similarity_kernel_t<OutT, LutT> kernel,
+                     const cudaDeviceProp& dev_props) RAFT_EXPLICIT;
+};
+
+template <typename OutT, typename LutT>
+struct selected {
+  compute_similarity_kernel_t<OutT, LutT> kernel;
+  dim3 grid_dim;
+  dim3 block_dim;
+  size_t smem_size;
+  size_t device_lut_size;
+
+  template <typename... Args>
+  void operator()(rmm::cuda_stream_view stream, Args... args);
+};
+
+/**
+ * Use heuristics to choose an optimal instance of the search kernel.
+ * It selects among a few kernel variants (with/out using shared mem for
+ * lookup tables / precomputed distances) and tries to choose the block size
+ * to maximize kernel occupancy.
+ *
+ * @param manage_local_topk
+ *    whether to use the fused calculate+select or just calculate the distances for each
+ *    query and probed cluster.
+ *
+ * @param locality_hint
+ *    the number of active blocks per SM beyond which a further increase is not expected
+ *    to improve locality anymore.
+ */
+template <typename OutT, typename LutT>
+auto compute_similarity_select(const cudaDeviceProp& dev_props,
+                               bool manage_local_topk,
+                               int locality_hint,
+                               double preferred_shmem_carveout,
+                               uint32_t pq_bits,
+                               uint32_t pq_dim,
+                               uint32_t precomp_data_count,
+                               uint32_t n_queries,
+                               uint32_t n_probes,
+                               uint32_t topk) -> selected<OutT, LutT> RAFT_EXPLICIT;
+
+}  // namespace raft::neighbors::ivf_pq::detail
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+
+#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT)         \
+  extern template auto raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT>( \
+    const cudaDeviceProp& dev_props,                                                           \
+    bool manage_local_topk,                                                                    \
+    int locality_hint,                                                                         \
+    double preferred_shmem_carveout,                                                           \
+    uint32_t pq_bits,                                                                          \
+    uint32_t pq_dim,                                                                           \
+    uint32_t precomp_data_count,                                                               \
+    uint32_t n_queries,                                                                        \
+    uint32_t n_probes,                                                                         \
+    uint32_t topk)                                                                             \
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+
+#define COMMA ,
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(half, half);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(float, half);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(float, float);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>);
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>);
+
+#undef COMMA
+
+#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
new file mode 100644
index 0000000000..4ce205bda2
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
@@ -0,0 +1,803 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/distance/distance_types.hpp>  // raft::distance::DistanceType
+#include <raft/matrix/detail/select_warpsort.cuh>  // matrix::detail::select::warpsort::warp_sort_distributed
+#include <raft/neighbors/detail/ivf_pq_dummy_block_sort.cuh>  // dummy_block_sort_t
+#include <raft/neighbors/ivf_pq_types.hpp>                    // codebook_gen
+#include <raft/util/cuda_rt_essentials.hpp>                   // RAFT_CUDA_TRY
+#include <raft/util/device_atomics.cuh>                       // raft::atomicMin
+#include <raft/util/pow2_utils.cuh>                           // raft::Pow2
+#include <raft/util/vectorized.cuh>                           // raft::TxN_t
+#include <rmm/cuda_stream_view.hpp>                           // rmm::cuda_stream_view
+
+namespace raft::neighbors::ivf_pq::detail {
+
+/**
+ * Maximum value of k for the fused calculate & select in ivfpq.
+ *
+ * If runtime value of k is larger than this, the main search operation
+ * is split into two kernels (per batch, first calculate distance, then select top-k).
+ */
+static constexpr int kMaxCapacity = 128;
+static_assert((kMaxCapacity >= 32) && !(kMaxCapacity & (kMaxCapacity - 1)),
+              "kMaxCapacity must be a power of two, not smaller than the WarpSize.");
+
+// inline here, because this header may be included in multiple translation units.
+inline auto is_local_topk_feasible(uint32_t k, uint32_t n_probes, uint32_t n_queries) -> bool
+{
+  if (k > kMaxCapacity) { return false; }             // warp_sort not possible
+  if (n_probes <= 16) { return false; }               // too few clusters
+  if (n_queries * n_probes <= 256) { return false; }  // overall amount of work is too small
+  return true;
+}
+
+template <int Capacity, typename T, typename IdxT>
+struct pq_block_sort {
+  using type = matrix::detail::select::warpsort::
+    block_sort<matrix::detail::select::warpsort::warp_sort_distributed, Capacity, true, T, IdxT>;
+};
+
+template <typename T, typename IdxT>
+struct pq_block_sort<0, T, IdxT> : dummy_block_sort_t<T, IdxT> {
+  using type = dummy_block_sort_t<T, IdxT>;
+};
+
+template <int Capacity, typename T, typename IdxT>
+using block_sort_t = typename pq_block_sort<Capacity, T, IdxT>::type;
+
+/**
+ * Estimate a carveout value as expected by `cudaFuncAttributePreferredSharedMemoryCarveout`
+ * (which does not take into account `reservedSharedMemPerBlock`),
+ * given by a desired shmem-L1 split and a per-block memory requirement in bytes.
+ *
+ * NB: As per the programming guide, the memory carveout setting is just a hint for the driver; it's
+ * free to choose any shmem-L1 configuration it deems appropriate. For example, if you set the
+ * carveout to zero, it will choose a non-zero config that will allow to run at least one active
+ * block per SM.
+ *
+ * @param shmem_fraction
+ *   a fraction representing a desired split (shmem / (shmem + L1)) [0, 1].
+ * @param shmem_per_block
+ *   a shared memory usage per block (dynamic + static shared memory sizes), in bytes.
+ * @param dev_props
+ *   device properties.
+ * @return
+ *   a carveout value in percents [0, 100].
+ */
+constexpr inline auto estimate_carveout(double shmem_fraction,
+                                        size_t shmem_per_block,
+                                        const cudaDeviceProp& dev_props) -> int
+{
+  using shmem_unit = Pow2<128>;
+  size_t m         = shmem_unit::roundUp(shmem_per_block);
+  size_t r         = dev_props.reservedSharedMemPerBlock;
+  size_t s         = dev_props.sharedMemPerMultiprocessor;
+  return (size_t(100 * s * m * shmem_fraction) - (m - 1) * r) / (s * (m + r));
+}
+
+/* Manually unrolled loop over a chunk of pq_dataset that fits into one VecT. */
+template <typename OutT,
+          typename LutT,
+          typename VecT,
+          bool CheckBounds,
+          uint32_t PqBits,
+          uint32_t BitsLeft = 0,
+          uint32_t Ix       = 0>
+__device__ __forceinline__ void ivfpq_compute_chunk(OutT& score /* NOLINT */,
+                                                    typename VecT::math_t& pq_code,
+                                                    const VecT& pq_codes,
+                                                    const LutT*& lut_head,
+                                                    const LutT*& lut_end)
+{
+  if constexpr (CheckBounds) {
+    if (lut_head >= lut_end) { return; }
+  }
+  constexpr uint32_t kTotalBits = 8 * sizeof(typename VecT::math_t);
+  constexpr uint32_t kPqShift   = 1u << PqBits;
+  constexpr uint32_t kPqMask    = kPqShift - 1u;
+  if constexpr (BitsLeft >= PqBits) {
+    uint8_t code = pq_code & kPqMask;
+    pq_code >>= PqBits;
+    score += OutT(lut_head[code]);
+    lut_head += kPqShift;
+    return ivfpq_compute_chunk<OutT, LutT, VecT, CheckBounds, PqBits, BitsLeft - PqBits, Ix>(
+      score, pq_code, pq_codes, lut_head, lut_end);
+  } else if constexpr (Ix < VecT::Ratio) {
+    uint8_t code                = pq_code;
+    pq_code                     = pq_codes.val.data[Ix];
+    constexpr uint32_t kRemBits = PqBits - BitsLeft;
+    constexpr uint32_t kRemMask = (1u << kRemBits) - 1u;
+    code |= (pq_code & kRemMask) << BitsLeft;
+    pq_code >>= kRemBits;
+    score += OutT(lut_head[code]);
+    lut_head += kPqShift;
+    return ivfpq_compute_chunk<OutT,
+                               LutT,
+                               VecT,
+                               CheckBounds,
+                               PqBits,
+                               kTotalBits - kRemBits,
+                               Ix + 1>(score, pq_code, pq_codes, lut_head, lut_end);
+  }
+}
+
+/* Compute the similarity for one vector in the pq_dataset */
+template <typename OutT, typename LutT, typename VecT, uint32_t PqBits>
+__device__ auto ivfpq_compute_score(uint32_t pq_dim,
+                                    const typename VecT::io_t* pq_head,
+                                    const LutT* lut_scores,
+                                    OutT early_stop_limit) -> OutT
+{
+  constexpr uint32_t kChunkSize = sizeof(VecT) * 8u / PqBits;
+  auto lut_head                 = lut_scores;
+  auto lut_end                  = lut_scores + (pq_dim << PqBits);
+  VecT pq_codes;
+  OutT score{0};
+  for (; pq_dim >= kChunkSize; pq_dim -= kChunkSize) {
+    *pq_codes.vectorized_data() = *pq_head;
+    pq_head += kIndexGroupSize;
+    typename VecT::math_t pq_code = 0;
+    ivfpq_compute_chunk<OutT, LutT, VecT, false, PqBits>(
+      score, pq_code, pq_codes, lut_head, lut_end);
+    // Early stop when it makes sense (otherwise early_stop_limit is kDummy/infinity).
+    if (score >= early_stop_limit) { return score; }
+  }
+  if (pq_dim > 0) {
+    *pq_codes.vectorized_data()   = *pq_head;
+    typename VecT::math_t pq_code = 0;
+    ivfpq_compute_chunk<OutT, LutT, VecT, true, PqBits>(
+      score, pq_code, pq_codes, lut_head, lut_end);
+  }
+  return score;
+}
+
+/**
+ * The main kernel that computes similarity scores across multiple queries and probes.
+ * When `Capacity > 0`, it also selects top K candidates for each query and probe
+ * (which need to be merged across probes afterwards).
+ *
+ * Each block processes a (query, probe) pair: it calculates the distance between the single query
+ * vector and all the dataset vectors in the cluster that we are probing.
+ *
+ * @tparam OutT
+ *   The output type - distances.
+ * @tparam LutT
+ *   The lookup table element type (lut_scores).
+ * @tparam PqBits
+ *   The bit length of an encoded vector element after compression by PQ
+ *   (NB: pq_book_size = 1 << PqBits).
+ * @tparam Capacity
+ *   Power-of-two; the maximum possible `k` in top-k. Value zero disables fused top-k search.
+ * @tparam PrecompBaseDiff
+ *   Defines whether we should precompute part of the distance and keep it in shared memory
+ *   before the main part (score calculation) to increase memory usage efficiency in the latter.
+ *   For L2, this is the distance between the query and the cluster center.
+ * @tparam EnableSMemLut
+ *   Defines whether to use the shared memory for the lookup table (`lut_scores`).
+ *   Setting this to `false` allows to reduce the shared memory usage (and maximum data dim)
+ *   at the cost of reducing global memory reading throughput.
+ *
+ * @param n_rows the number of records in the dataset
+ * @param dim the dimensionality of the data (NB: after rotation transform, i.e. `index.rot_dim()`).
+ * @param n_probes the number of clusters to search for each query
+ * @param pq_dim
+ *   The dimensionality of an encoded vector after compression by PQ.
+ * @param n_queries the number of queries.
+ * @param metric the distance type.
+ * @param codebook_kind Defines the way PQ codebooks have been trained.
+ * @param topk the `k` in the select top-k.
+ * @param max_samples the size of the output for a single query.
+ * @param cluster_centers
+ *   The device pointer to the cluster centers in the original space (NB: after rotation)
+ *   [n_clusters, dim].
+ * @param pq_centers
+ *   The device pointer to the cluster centers in the PQ space
+ *   [pq_dim, pq_len, pq_book_size] or [n_clusters, pq_len, pq_book_size].
+ * @param pq_dataset
+ *   The device pointer to the PQ index (data) [n_rows, ...].
+ * @param cluster_labels
+ *   The device pointer to the labels (clusters) for each query and probe [n_queries, n_probes].
+ * @param _chunk_indices
+ *   The device pointer to the data offsets for each query and probe [n_queries, n_probes].
+ * @param queries
+ *   The device pointer to the queries (NB: after rotation) [n_queries, dim].
+ * @param index_list
+ *   An optional device pointer to the enforced order of search [n_queries, n_probes].
+ *   One can pass reordered indices here to try to improve data reading locality.
+ * @param lut_scores
+ *   The device pointer for storing the lookup table globally [gridDim.x, pq_dim << PqBits].
+ *   Ignored when `EnableSMemLut == true`.
+ * @param _out_scores
+ *   The device pointer to the output scores
+ *   [n_queries, max_samples] or [n_queries, n_probes, topk].
+ * @param _out_indices
+ *   The device pointer to the output indices [n_queries, n_probes, topk].
+ *   These are the indices of the records as they appear in the database view formed by the probed
+ *   clusters / defined by the `_chunk_indices`.
+ *   The indices can have values within the range [0, max_samples).
+ *   Ignored when `Capacity == 0`.
+ */
+template <typename OutT,
+          typename LutT,
+          uint32_t PqBits,
+          int Capacity,
+          bool PrecompBaseDiff,
+          bool EnableSMemLut>
+__global__ void compute_similarity_kernel(uint32_t n_rows,
+                                          uint32_t dim,
+                                          uint32_t n_probes,
+                                          uint32_t pq_dim,
+                                          uint32_t n_queries,
+                                          distance::DistanceType metric,
+                                          codebook_gen codebook_kind,
+                                          uint32_t topk,
+                                          uint32_t max_samples,
+                                          const float* cluster_centers,
+                                          const float* pq_centers,
+                                          const uint8_t* const* pq_dataset,
+                                          const uint32_t* cluster_labels,
+                                          const uint32_t* _chunk_indices,
+                                          const float* queries,
+                                          const uint32_t* index_list,
+                                          float* query_kths,
+                                          LutT* lut_scores,
+                                          OutT* _out_scores,
+                                          uint32_t* _out_indices)
+{
+  /* Shared memory:
+
+    * lut_scores: lookup table (LUT) of size = `pq_dim << PqBits`  (when EnableSMemLut)
+    * base_diff: size = dim (which is equal to `pq_dim * pq_len`), or dim*2 for InnerProduct
+    * topk::block_sort: some amount of shared memory, but overlaps with the rest:
+        block_sort only needs shared memory for `.done()` operation, which can come very last.
+  */
+  extern __shared__ __align__(256) uint8_t smem_buf[];  // NOLINT
+  constexpr bool kManageLocalTopK = Capacity > 0;
+
+  constexpr uint32_t PqShift = 1u << PqBits;  // NOLINT
+  constexpr uint32_t PqMask  = PqShift - 1u;  // NOLINT
+
+  const uint32_t pq_len   = dim / pq_dim;
+  const uint32_t lut_size = pq_dim * PqShift;
+
+  if constexpr (EnableSMemLut) {
+    lut_scores = reinterpret_cast<LutT*>(smem_buf);
+  } else {
+    lut_scores += lut_size * blockIdx.x;
+  }
+
+  float* base_diff = nullptr;
+  if constexpr (PrecompBaseDiff) {
+    if constexpr (EnableSMemLut) {
+      base_diff = reinterpret_cast<float*>(lut_scores + lut_size);
+    } else {
+      base_diff = reinterpret_cast<float*>(smem_buf);
+    }
+  }
+
+  for (int ib = blockIdx.x; ib < n_queries * n_probes; ib += gridDim.x) {
+    if (ib >= gridDim.x) {
+      // sync shared memory accesses on the second and further iterations
+      __syncthreads();
+    }
+    uint32_t query_ix;
+    uint32_t probe_ix;
+    if (index_list == nullptr) {
+      query_ix = ib % n_queries;
+      probe_ix = ib / n_queries;
+    } else {
+      auto ordered_ix = index_list[ib];
+      query_ix        = ordered_ix / n_probes;
+      probe_ix        = ordered_ix % n_probes;
+    }
+
+    const uint32_t* chunk_indices = _chunk_indices + (n_probes * query_ix);
+    const float* query            = queries + (dim * query_ix);
+    OutT* out_scores;
+    uint32_t* out_indices = nullptr;
+    if constexpr (kManageLocalTopK) {
+      // Store topk calculated distances to out_scores (and its indices to out_indices)
+      out_scores  = _out_scores + topk * (probe_ix + (n_probes * query_ix));
+      out_indices = _out_indices + topk * (probe_ix + (n_probes * query_ix));
+    } else {
+      // Store all calculated distances to out_scores
+      out_scores = _out_scores + max_samples * query_ix;
+    }
+    uint32_t label              = cluster_labels[n_probes * query_ix + probe_ix];
+    const float* cluster_center = cluster_centers + (dim * label);
+    const float* pq_center;
+    if (codebook_kind == codebook_gen::PER_SUBSPACE) {
+      pq_center = pq_centers;
+    } else {
+      pq_center = pq_centers + (pq_len << PqBits) * label;
+    }
+
+    if constexpr (PrecompBaseDiff) {
+      // Reduce number of memory reads later by pre-computing parts of the score
+      switch (metric) {
+        case distance::DistanceType::L2SqrtExpanded:
+        case distance::DistanceType::L2Expanded: {
+          for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) {
+            base_diff[i] = query[i] - cluster_center[i];
+          }
+        } break;
+        case distance::DistanceType::InnerProduct: {
+          float2 pvals;
+          for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) {
+            pvals.x                                 = query[i];
+            pvals.y                                 = cluster_center[i] * pvals.x;
+            reinterpret_cast<float2*>(base_diff)[i] = pvals;
+          }
+        } break;
+        default: __builtin_unreachable();
+      }
+      __syncthreads();
+    }
+
+    {
+      // Create a lookup table
+      // For each subspace, the lookup table stores the distance between the actual query vector
+      // (projected into the subspace) and all possible pq vectors in that subspace.
+      for (uint32_t i = threadIdx.x; i < lut_size; i += blockDim.x) {
+        const uint32_t i_pq  = i >> PqBits;
+        uint32_t j           = i_pq * pq_len;
+        const uint32_t j_end = pq_len + j;
+        auto cur_pq_center   = pq_center + (i & PqMask) +
+                             (codebook_kind == codebook_gen::PER_SUBSPACE ? j * PqShift : 0u);
+        float score = 0.0;
+        do {
+          float pq_c = *cur_pq_center;
+          cur_pq_center += PqShift;
+          switch (metric) {
+            case distance::DistanceType::L2SqrtExpanded:
+            case distance::DistanceType::L2Expanded: {
+              float diff;
+              if constexpr (PrecompBaseDiff) {
+                diff = base_diff[j];
+              } else {
+                diff = query[j] - cluster_center[j];
+              }
+              diff -= pq_c;
+              score += diff * diff;
+            } break;
+            case distance::DistanceType::InnerProduct: {
+              // NB: we negate the scores as we hardcoded select-topk to always compute the minimum
+              float q;
+              if constexpr (PrecompBaseDiff) {
+                float2 pvals = reinterpret_cast<float2*>(base_diff)[j];
+                q            = pvals.x;
+                score -= pvals.y;
+              } else {
+                q = query[j];
+                score -= q * cluster_center[j];
+              }
+              score -= q * pq_c;
+            } break;
+            default: __builtin_unreachable();
+          }
+        } while (++j < j_end);
+        lut_scores[i] = LutT(score);
+      }
+    }
+
+    // Define helper types for efficient access to the pq_dataset, which is stored in an interleaved
+    // format. The chunks of PQ data are stored in kIndexGroupVecLen-bytes-long chunks, interleaved
+    // in groups of kIndexGroupSize elems (which is normally equal to the warp size) for the fastest
+    // possible access by thread warps.
+    //
+    // Consider one record in the pq_dataset is `pq_dim * pq_bits`-bit-long.
+    // Assuming `kIndexGroupVecLen = 16`, one chunk of data read by a thread at once is 128-bits.
+    // Then, such a chunk contains `chunk_size = 128 / pq_bits` record elements, and the record
+    // consists of `ceildiv(pq_dim, chunk_size)` chunks. The chunks are interleaved in groups of 32,
+    // so that the warp can achieve the best coalesced read throughput.
+    using group_align  = Pow2<kIndexGroupSize>;
+    using vec_align    = Pow2<kIndexGroupVecLen>;
+    using local_topk_t = block_sort_t<Capacity, OutT, uint32_t>;
+    using op_t         = uint32_t;
+    using vec_t        = TxN_t<op_t, kIndexGroupVecLen / sizeof(op_t)>;
+
+    uint32_t sample_offset = 0;
+    if (probe_ix > 0) { sample_offset = chunk_indices[probe_ix - 1]; }
+    uint32_t n_samples            = chunk_indices[probe_ix] - sample_offset;
+    uint32_t n_samples_aligned    = group_align::roundUp(n_samples);
+    constexpr uint32_t kChunkSize = (kIndexGroupVecLen * 8u) / PqBits;
+    uint32_t pq_line_width        = div_rounding_up_unsafe(pq_dim, kChunkSize) * kIndexGroupVecLen;
+    auto pq_thread_data = pq_dataset[label] + group_align::roundDown(threadIdx.x) * pq_line_width +
+                          group_align::mod(threadIdx.x) * vec_align::Value;
+    pq_line_width *= blockDim.x;
+
+    constexpr OutT kDummy = upper_bound<OutT>();
+    OutT query_kth        = kDummy;
+    if constexpr (kManageLocalTopK) { query_kth = OutT(query_kths[query_ix]); }
+    local_topk_t block_topk(topk, nullptr, query_kth);
+    OutT early_stop_limit = kDummy;
+    switch (metric) {
+      // If the metric is non-negative, we can use the query_kth approximation as an early stop
+      // threshold to skip some iterations when computing the score. Add such metrics here.
+      case distance::DistanceType::L2SqrtExpanded:
+      case distance::DistanceType::L2Expanded: {
+        early_stop_limit = query_kth;
+      } break;
+      default: break;
+    }
+
+    // Ensure lut_scores is written by all threads before using it in ivfpq-compute-score
+    __threadfence_block();
+    __syncthreads();
+
+    // Compute a distance for each sample
+    for (uint32_t i = threadIdx.x; i < n_samples_aligned;
+         i += blockDim.x, pq_thread_data += pq_line_width) {
+      OutT score = kDummy;
+      bool valid = i < n_samples;
+      if (valid) {
+        score = ivfpq_compute_score<OutT, LutT, vec_t, PqBits>(
+          pq_dim,
+          reinterpret_cast<const vec_t::io_t*>(pq_thread_data),
+          lut_scores,
+          early_stop_limit);
+      }
+      if constexpr (kManageLocalTopK) {
+        block_topk.add(score, sample_offset + i);
+      } else {
+        if (valid) { out_scores[sample_offset + i] = score; }
+      }
+    }
+    if constexpr (kManageLocalTopK) {
+      // sync threads before the topk merging operation, because we reuse smem_buf
+      __syncthreads();
+      block_topk.done(smem_buf);
+      block_topk.store(out_scores, out_indices);
+      if (threadIdx.x == 0) { atomicMin(query_kths + query_ix, float(out_scores[topk - 1])); }
+    } else {
+      // fill in the rest of the out_scores with dummy values
+      if (probe_ix + 1 == n_probes) {
+        for (uint32_t i = threadIdx.x + sample_offset + n_samples; i < max_samples;
+             i += blockDim.x) {
+          out_scores[i] = kDummy;
+        }
+      }
+    }
+  }
+}
+
+// Kernel pointer type determined by OutT and LutT alone (the other template arguments are fixed)
+template <typename OutT, typename LutT>
+using compute_similarity_kernel_t =
+  decltype(&compute_similarity_kernel<OutT, LutT, 8, 0, true, true>);
+
+// The config struct lifts the runtime parameters (pq_bits, k_max) to the template parameters
+template <typename OutT, typename LutT, bool PrecompBaseDiff, bool EnableSMemLut>
+struct compute_similarity_kernel_config {
+ public:
+  static auto get(uint32_t pq_bits, uint32_t k_max) -> compute_similarity_kernel_t<OutT, LutT>
+  {
+    return kernel_choose_bits(pq_bits, k_max);
+  }
+
+ private:
+  static auto kernel_choose_bits(uint32_t pq_bits, uint32_t k_max)
+    -> compute_similarity_kernel_t<OutT, LutT>
+  {
+    switch (pq_bits) {  // every branch starts the capacity search from kMaxCapacity
+      case 4: return kernel_try_capacity<4, kMaxCapacity>(k_max);
+      case 5: return kernel_try_capacity<5, kMaxCapacity>(k_max);
+      case 6: return kernel_try_capacity<6, kMaxCapacity>(k_max);
+      case 7: return kernel_try_capacity<7, kMaxCapacity>(k_max);
+      case 8: return kernel_try_capacity<8, kMaxCapacity>(k_max);
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }
+
+  template <uint32_t PqBits, int Capacity>
+  static auto kernel_try_capacity(uint32_t k_max) -> compute_similarity_kernel_t<OutT, LutT>
+  {
+    if constexpr (Capacity > 0) {  // k_max of 0 or above Capacity selects the Capacity = 0 variant
+      if (k_max == 0 || k_max > Capacity) { return kernel_try_capacity<PqBits, 0>(k_max); }
+    }
+    if constexpr (Capacity > 1) {  // halve Capacity for as long as k_max still fits in the half
+      if (k_max * 2 <= Capacity) { return kernel_try_capacity<PqBits, (Capacity / 2)>(k_max); }
+    }
+    return compute_similarity_kernel<OutT, LutT, PqBits, Capacity, PrecompBaseDiff, EnableSMemLut>;
+  }
+};
+
+// A standalone accessor function is necessary to make sure template specializations work correctly
+// (we "extern template" this function); it only forwards to compute_similarity_kernel_config::get.
+template <typename OutT, typename LutT, bool PrecompBaseDiff, bool EnableSMemLut>
+auto get_compute_similarity_kernel(uint32_t pq_bits, uint32_t k_max)
+  -> compute_similarity_kernel_t<OutT, LutT>
+{
+  return compute_similarity_kernel_config<OutT, LutT, PrecompBaseDiff, EnableSMemLut>::get(pq_bits,
+                                                                                           k_max);
+}
+
+/** Estimate the occupancy for the given kernel on the given device. */
+template <typename OutT, typename LutT>
+struct occupancy_t {
+  using shmem_unit = Pow2<128>;  // shmem sizes are rounded up to a multiple of 128 for shmem_use
+
+  int blocks_per_sm = 0;    // as reported by cudaOccupancyMaxActiveBlocksPerMultiprocessor
+  double occupancy  = 0.0;  // (blocks_per_sm * n_threads) / maxThreadsPerMultiProcessor
+  double shmem_use  = 1.0;  // (rounded smem * blocks_per_sm) / sharedMemPerMultiprocessor
+
+  inline occupancy_t() = default;
+  inline occupancy_t(size_t smem,
+                     uint32_t n_threads,
+                     compute_similarity_kernel_t<OutT, LutT> kernel,
+                     const cudaDeviceProp& dev_props)
+  {
+    RAFT_CUDA_TRY(
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, kernel, n_threads, smem));
+    occupancy = double(blocks_per_sm * n_threads) / double(dev_props.maxThreadsPerMultiProcessor);
+    shmem_use = double(shmem_unit::roundUp(smem) * blocks_per_sm) /
+                double(dev_props.sharedMemPerMultiprocessor);
+  }
+};
+
+template <typename OutT, typename LutT>
+struct selected {  // a kernel instance bundled with the launch configuration chosen for it
+  compute_similarity_kernel_t<OutT, LutT> kernel;
+  dim3 grid_dim;
+  dim3 block_dim;
+  size_t smem_size;        // dynamic shared memory size passed at launch
+  size_t device_lut_size;  // number of LUT entries kept in global memory (0 if the LUT is in shmem)
+
+  template <typename... Args>
+  void operator()(rmm::cuda_stream_view stream, Args... args)  // launch with the stored config
+  {
+    kernel<<<grid_dim, block_dim, smem_size, stream>>>(args...);
+    RAFT_CHECK_CUDA(stream);
+  }
+};
+
+/**
+ * Use heuristics to choose an optimal instance of the search kernel.
+ * It selects among a few kernel variants (with/out using shared mem for
+ * lookup tables / precomputed distances) and tries to choose the block size
+ * to maximize kernel occupancy.
+ *
+ * @param manage_local_topk
+ *    whether to use the fused calculate+select or just calculate the distances for each
+ *    query and probed cluster.
+ * @param locality_hint
+ *    beyond this number of active blocks per SM, a further increase is not considered to
+ *    improve locality anymore.
+ * @return the selected kernel together with its grid/block sizes and memory requirements.
+ */
+template <typename OutT, typename LutT>
+auto compute_similarity_select(const cudaDeviceProp& dev_props,
+                               bool manage_local_topk,
+                               int locality_hint,
+                               double preferred_shmem_carveout,
+                               uint32_t pq_bits,
+                               uint32_t pq_dim,
+                               uint32_t precomp_data_count,
+                               uint32_t n_queries,
+                               uint32_t n_probes,
+                               uint32_t topk) -> selected<OutT, LutT>
+{
+  // Shared memory for storing the lookup table
+  size_t lut_mem = sizeof(LutT) * (pq_dim << pq_bits);
+  // Shared memory for storing pre-computed pieces to speedup the lookup table construction
+  // (e.g. the distance between a cluster center and the query for L2).
+  size_t bdf_mem = sizeof(float) * precomp_data_count;
+  // Shared memory for the fused top-k component; it may overlap with the other uses of shared
+  // memory and depends on the number of threads.
+  struct ltk_mem_t {
+    uint32_t subwarp_size;
+    uint32_t topk;
+    bool manage_local_topk;
+    ltk_mem_t(bool manage_local_topk, uint32_t topk)
+      : topk(topk), manage_local_topk(manage_local_topk)
+    {
+      subwarp_size = WarpSize;
+      while (subwarp_size > 1 && topk * 2 <= subwarp_size) {  // `> 1`: topk == 0 must not spin
+        subwarp_size /= 2;
+      }
+    }
+
+    [[nodiscard]] auto operator()(uint32_t n_threads) const -> size_t
+    {
+      return manage_local_topk
+               ? matrix::detail::select::warpsort::template calc_smem_size_for_block_wide<OutT,
+                                                                                          uint32_t>(
+                   n_threads / subwarp_size, topk)
+               : 0;
+    }
+  } ltk_mem{manage_local_topk, topk};
+
+  // Total amount of work; should be enough to occupy the GPU.
+  uint32_t n_blocks = n_queries * n_probes;
+
+  // The minimum block size we may want:
+  //   1. It's a power-of-two for efficient L1 caching of pq_centers values
+  //      (multiples of `1 << pq_bits`).
+  //   2. It should be large enough to fully utilize an SM.
+  uint32_t n_threads_min = WarpSize;
+  while (dev_props.maxBlocksPerMultiProcessor * int(n_threads_min) <
+         dev_props.maxThreadsPerMultiProcessor) {
+    n_threads_min *= 2;
+  }
+  // Further increase the minimum block size to make sure full device occupancy
+  // (NB: this may lead to `n_threads_min` being larger than the kernel's maximum)
+  while (int(n_blocks * n_threads_min) <
+           dev_props.multiProcessorCount * dev_props.maxThreadsPerMultiProcessor &&
+         int(n_threads_min) < dev_props.maxThreadsPerBlock) {
+    n_threads_min *= 2;
+  }
+  // Even further, increase it to allow less blocks per SM if there are not enough queries.
+  // With this, we reduce the chance of different clusters being processed by two blocks
+  // on the same SM and thus improve the data locality for L1 caching.
+  while (int(n_queries * n_threads_min) < dev_props.maxThreadsPerMultiProcessor &&
+         int(n_threads_min) < dev_props.maxThreadsPerBlock) {
+    n_threads_min *= 2;
+  }
+
+  // Granularity of changing the number of threads when computing the maximum block size.
+  // It's good to have it multiple of the PQ book width.
+  uint32_t n_threads_gty = round_up_safe<uint32_t>(1u << pq_bits, WarpSize);
+
+  /*
+   Shared memory / L1 cache balance is the main limiter of this kernel.
+   The more blocks per SM we launch, the more shared memory we need. Besides that, we have
+   three versions of the kernel varying in performance and shmem usage.
+
+   We try the most demanding and the fastest kernel first, trying to maximize occupancy with
+   the minimum number of blocks (just one, really). Then, we tweak the `n_threads` to further
+   optimize occupancy and data locality for the L1 cache.
+   */
+  auto conf_fast        = get_compute_similarity_kernel<OutT, LutT, true, true>;
+  auto conf_no_basediff = get_compute_similarity_kernel<OutT, LutT, false, true>;
+  auto conf_no_smem_lut = get_compute_similarity_kernel<OutT, LutT, true, false>;
+  auto topk_or_zero     = manage_local_topk ? topk : 0u;
+  std::array candidates{std::make_tuple(conf_fast(pq_bits, topk_or_zero), lut_mem + bdf_mem, true),
+                        std::make_tuple(conf_no_basediff(pq_bits, topk_or_zero), lut_mem, true),
+                        std::make_tuple(conf_no_smem_lut(pq_bits, topk_or_zero), bdf_mem, false)};
+
+  // we may allow slightly lower than 100% occupancy;
+  constexpr double kTargetOccupancy = 0.75;
+  // This struct is used to select the better candidate
+  occupancy_t<OutT, LutT> selected_perf{};
+  selected<OutT, LutT> selected_config;
+  for (auto [kernel, smem_size_const, lut_is_in_shmem] : candidates) {
+    if (smem_size_const > dev_props.sharedMemPerBlockOptin) {
+      // Even a single block cannot fit into an SM due to shmem requirements. Skip the candidate.
+      continue;
+    }
+
+    // First, we set the carveout hint to the preferred value. The driver will increase this if
+    // needed to run at least one block per SM. At the same time, if more blocks fit into one SM,
+    // this carveout value will limit the calculated occupancy. When we're done selecting the best
+    // launch configuration, we will tighten the carveout once more, based on the final memory
+    // usage and occupancy.
+    const int max_carveout =
+      estimate_carveout(preferred_shmem_carveout, smem_size_const, dev_props);
+    RAFT_CUDA_TRY(
+      cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout, max_carveout));
+
+    // Get the theoretical maximum possible number of threads per block
+    cudaFuncAttributes kernel_attrs;
+    RAFT_CUDA_TRY(cudaFuncGetAttributes(&kernel_attrs, kernel));
+    uint32_t n_threads = round_down_safe<uint32_t>(kernel_attrs.maxThreadsPerBlock, n_threads_gty);
+
+    // Actual required shmem depends on the number of threads
+    size_t smem_size = max(smem_size_const, ltk_mem(n_threads));
+
+    // Make sure the kernel can get enough shmem.
+    cudaError_t cuda_status =
+      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
+    if (cuda_status != cudaSuccess) {
+      RAFT_EXPECTS(
+        cuda_status == cudaGetLastError(),
+        "Tried to reset the expected cuda error code, but it didn't match the expectation");
+      // Failed to request enough shmem for the kernel. Skip the candidate.
+      continue;
+    }
+
+    occupancy_t<OutT, LutT> cur(smem_size, n_threads, kernel, dev_props);
+    if (cur.blocks_per_sm <= 0) {
+      // For some reason, we still cannot make this kernel run. Skip the candidate.
+      continue;
+    }
+
+    {
+      // Try to reduce the number of threads to increase occupancy and data locality
+      auto n_threads_tmp = n_threads_min;
+      while (n_threads_tmp * 2 < n_threads) {
+        n_threads_tmp *= 2;
+      }
+      if (n_threads_tmp < n_threads) {
+        while (n_threads_tmp >= n_threads_min) {
+          auto smem_size_tmp = max(smem_size_const, ltk_mem(n_threads_tmp));
+          occupancy_t<OutT, LutT> tmp(smem_size_tmp, n_threads_tmp, kernel, dev_props);
+          bool select_it = false;
+          if (lut_is_in_shmem && locality_hint >= tmp.blocks_per_sm) {
+            // Normally, the smaller the block the better for L1 cache hit rate.
+            // Hence, the occupancy should be "just good enough"
+            select_it = tmp.occupancy >= min(kTargetOccupancy, cur.occupancy);
+          } else if (lut_is_in_shmem) {
+            // If we don't have enough repeating probes (locality_hint < tmp.blocks_per_sm),
+            // the locality is not going to improve with increasing the number of blocks per SM.
+            // Hence, the only metric here is the occupancy.
+            bool improves_occupancy = tmp.occupancy > cur.occupancy;
+            // Otherwise, the performance still improves with a smaller block size,
+            // given there is enough work to do
+            bool improves_parallelism =
+              tmp.occupancy == cur.occupancy &&
+              7u * tmp.blocks_per_sm * dev_props.multiProcessorCount <= n_blocks;
+            select_it = improves_occupancy || improves_parallelism;
+          } else {
+            // If we don't use shared memory for the lookup table, increasing the number of blocks
+            // is very taxing on the global memory usage.
+            // In this case, the occupancy must increase a lot to make it worth the cost.
+            select_it = tmp.occupancy >= min(1.0, cur.occupancy / kTargetOccupancy);
+          }
+          if (select_it) {
+            n_threads = n_threads_tmp;
+            smem_size = smem_size_tmp;
+            cur       = tmp;
+          }
+          n_threads_tmp /= 2;
+        }
+      }
+    }
+
+    {
+      if (selected_perf.occupancy <= 0.0  // no candidate yet
+          || (selected_perf.occupancy < cur.occupancy * kTargetOccupancy &&
+              selected_perf.shmem_use >= cur.shmem_use)  // much improved occupancy
+      ) {
+        selected_perf = cur;
+        if (lut_is_in_shmem) {
+          selected_config = {
+            kernel, dim3(n_blocks, 1, 1), dim3(n_threads, 1, 1), smem_size, size_t(0)};
+        } else {
+          // When the global memory is used for the lookup table, we need to minimize the grid
+          // size; otherwise, the kernel may quickly run out of memory.
+          auto n_blocks_min =
+            std::min<uint32_t>(n_blocks, cur.blocks_per_sm * dev_props.multiProcessorCount);
+          selected_config = {kernel,
+                             dim3(n_blocks_min, 1, 1),
+                             dim3(n_threads, 1, 1),
+                             smem_size,
+                             size_t(n_blocks_min) * size_t(pq_dim << pq_bits)};
+        }
+        // Actual shmem/L1 split wildly rounds up the specified preferred carveout, so we set here
+        // a rather conservative bar; most likely, the kernel gets more shared memory than this,
+        // and the occupancy doesn't get hurt.
+        auto carveout = std::min<int>(max_carveout, std::ceil(100.0 * cur.shmem_use));
+        RAFT_CUDA_TRY(
+          cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
+        if (cur.occupancy >= kTargetOccupancy) { break; }
+      } else if (selected_perf.occupancy > 0.0) {
+        // If we found a reasonable candidate on a previous iteration, and this one is not better,
+        // then don't try any more candidates because they are much slower anyway.
+        break;
+      }
+    }
+  }
+
+  RAFT_EXPECTS(selected_perf.occupancy > 0.0,
+               "Couldn't determine a working kernel launch configuration.");
+
+  return selected_config;
+}
+
+}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity.cuh
index e69de29bb2..2d63617798 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity.cuh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "ivf_pq_compute_similarity-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "ivf_pq_compute_similarity-ext.cuh"
+#endif
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_dummy_block_sort.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_dummy_block_sort.cuh
new file mode 100644
index 0000000000..a00b6a50ff
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_dummy_block_sort.cuh
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/matrix/detail/select_warpsort.cuh>  // matrix::detail::select::warpsort::warp_sort_distributed
+
+/*
+ * This header file is a bit of an ugly duckling. The type dummy_block_sort is
+ * needed by both ivf_pq_search.cuh and ivf_pq_compute_similarity.cuh.
+ *
+ * I have decided to move it to its own header file, which is overkill. Perhaps
+ * there is a nicer solution.
+ *
+ */
+
+namespace raft::neighbors::ivf_pq::detail {
+
+template <typename T, typename IdxT>
+struct dummy_block_sort_t {  // stand-in type: exposes queue_t only; the constructor does nothing
+  using queue_t = matrix::detail::select::warpsort::warp_sort_distributed<WarpSize, true, T, IdxT>;
+  template <typename... Args>
+  __device__ dummy_block_sort_t(int k, Args...){};
+};
+
+}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_fp_8bit.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_fp_8bit.cuh
new file mode 100644
index 0000000000..87f9bfb622
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_fp_8bit.cuh
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+
+#include <raft/neighbors/ivf_pq_types.hpp>
+
+#include <raft/core/cudart_utils.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/logger.hpp>
+#include <raft/core/nvtx.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/map.cuh>
+#include <raft/linalg/unary_op.cuh>
+#include <raft/matrix/detail/select_k.cuh>
+#include <raft/matrix/detail/select_warpsort.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/device_atomics.cuh>
+#include <raft/util/device_loads_stores.cuh>
+#include <raft/util/pow2_utils.cuh>
+#include <raft/util/vectorized.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <cub/cub.cuh>
+
+#include <cuda_fp16.h>
+
+#include <optional>
+
+namespace raft::neighbors::ivf_pq::detail {
+
+/** 8-bit floating-point storage type.
+ *
+ * This is a custom type for the current IVF-PQ implementation. No arithmetic operations are
+ * defined, only conversions to and from fp32. This type is unrelated to the proposed FP8 spec.
+ */
+template <uint32_t ExpBits, bool Signed>
+struct fp_8bit {
+  static_assert(ExpBits + uint8_t{Signed} <= 8, "The type does not fit in 8 bits.");
+  constexpr static uint32_t ExpMask = (1u << (ExpBits - 1u)) - 1u;  // NOLINT
+  constexpr static uint32_t ValBits = 8u - ExpBits;                 // NOLINT
+
+ public:
+  uint8_t bitstring;  // raw encoding; when Signed, the lowest bit holds the sign
+
+  HDI explicit fp_8bit(uint8_t bs) : bitstring(bs) {}
+  HDI explicit fp_8bit(float fp) : fp_8bit(float2fp_8bit(fp).bitstring) {}
+  HDI auto operator=(float fp) -> fp_8bit<ExpBits, Signed>&
+  {
+    bitstring = float2fp_8bit(fp).bitstring;
+    return *this;
+  }
+  HDI explicit operator float() const { return fp_8bit2float(*this); }
+  HDI explicit operator half() const { return half(fp_8bit2float(*this)); }
+
+ private:
+  static constexpr float kMin = 1.0f / float(1u << ExpMask);  // inputs below this encode as 0
+  static constexpr float kMax = float(1u << (ExpMask + 1)) * (2.0f - 1.0f / float(1u << ValBits));
+
+  static HDI auto float2fp_8bit(float v) -> fp_8bit<ExpBits, Signed>
+  {
+    if constexpr (Signed) {  // encode |v| as unsigned, then overwrite the lowest bit with the sign
+      auto u = fp_8bit<ExpBits, false>(std::abs(v)).bitstring;
+      u      = (u & 0xfeu) | uint8_t{v < 0};  // set the sign bit
+      return fp_8bit<ExpBits, true>(u);
+    } else {
+      // sic! all small and negative numbers are truncated to zero.
+      if (v < kMin) { return fp_8bit<ExpBits, false>{static_cast<uint8_t>(0)}; }
+      // protect from overflow
+      if (v >= kMax) { return fp_8bit<ExpBits, false>{static_cast<uint8_t>(0xffu)}; }
+      // the rest of possible float values should be within the normalized range
+      return fp_8bit<ExpBits, false>{static_cast<uint8_t>(
+        (*reinterpret_cast<uint32_t*>(&v) + (ExpMask << 23u) - 0x3f800000u) >> (15u + ExpBits))};
+    }
+  }
+
+  static HDI auto fp_8bit2float(const fp_8bit<ExpBits, Signed>& v) -> float
+  {
+    uint32_t u = v.bitstring;
+    if constexpr (Signed) {
+      u &= ~1;  // zero the sign bit
+    }
+    float r;  // its bit pattern is assembled through the uint32_t view below
+    *reinterpret_cast<uint32_t*>(&r) =
+      ((u << (15u + ExpBits)) + (0x3f800000u | (0x00400000u >> ValBits)) - (ExpMask << 23));
+    if constexpr (Signed) {  // recover the sign bit
+      if (v.bitstring & 1) { r = -r; }
+    }
+    return r;
+  }
+};
+
+}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
index 4b6e6f5e31..4256b1631a 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
@@ -18,6 +18,9 @@
 
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 
+#include <raft/neighbors/detail/ivf_pq_compute_similarity.cuh>
+#include <raft/neighbors/detail/ivf_pq_dummy_block_sort.cuh>
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
 #include <raft/neighbors/ivf_pq_types.hpp>
 
 #include <raft/core/cudart_utils.hpp>
@@ -49,79 +52,8 @@
 
 namespace raft::neighbors::ivf_pq::detail {
 
-/**
- * Maximum value of k for the fused calculate & select in ivfpq.
- *
- * If runtime value of k is larger than this, the main search operation
- * is split into two kernels (per batch, first calculate distance, then select top-k).
- */
-static constexpr int kMaxCapacity = 128;
-static_assert((kMaxCapacity >= 32) && !(kMaxCapacity & (kMaxCapacity - 1)),
-              "kMaxCapacity must be a power of two, not smaller than the WarpSize.");
-
 using namespace raft::spatial::knn::detail;  // NOLINT
 
-/** 8-bit floating-point storage type.
- *
- * This is a custom type for the current IVF-PQ implementation. No arithmetic operations defined
- * only conversion to and from fp32. This type is unrelated to the proposed FP8 specification.
- */
-template <uint32_t ExpBits, bool Signed>
-struct fp_8bit {
-  static_assert(ExpBits + uint8_t{Signed} <= 8, "The type does not fit in 8 bits.");
-  constexpr static uint32_t ExpMask = (1u << (ExpBits - 1u)) - 1u;  // NOLINT
-  constexpr static uint32_t ValBits = 8u - ExpBits;                 // NOLINT
-
- public:
-  uint8_t bitstring;
-
-  HDI explicit fp_8bit(uint8_t bs) : bitstring(bs) {}
-  HDI explicit fp_8bit(float fp) : fp_8bit(float2fp_8bit(fp).bitstring) {}
-  HDI auto operator=(float fp) -> fp_8bit<ExpBits, Signed>&
-  {
-    bitstring = float2fp_8bit(fp).bitstring;
-    return *this;
-  }
-  HDI explicit operator float() const { return fp_8bit2float(*this); }
-  HDI explicit operator half() const { return half(fp_8bit2float(*this)); }
-
- private:
-  static constexpr float kMin = 1.0f / float(1u << ExpMask);
-  static constexpr float kMax = float(1u << (ExpMask + 1)) * (2.0f - 1.0f / float(1u << ValBits));
-
-  static HDI auto float2fp_8bit(float v) -> fp_8bit<ExpBits, Signed>
-  {
-    if constexpr (Signed) {
-      auto u = fp_8bit<ExpBits, false>(std::abs(v)).bitstring;
-      u      = (u & 0xfeu) | uint8_t{v < 0};  // set the sign bit
-      return fp_8bit<ExpBits, true>(u);
-    } else {
-      // sic! all small and negative numbers are truncated to zero.
-      if (v < kMin) { return fp_8bit<ExpBits, false>{static_cast<uint8_t>(0)}; }
-      // protect from overflow
-      if (v >= kMax) { return fp_8bit<ExpBits, false>{static_cast<uint8_t>(0xffu)}; }
-      // the rest of possible float values should be within the normalized range
-      return fp_8bit<ExpBits, false>{static_cast<uint8_t>(
-        (*reinterpret_cast<uint32_t*>(&v) + (ExpMask << 23u) - 0x3f800000u) >> (15u + ExpBits))};
-    }
-  }
-
-  static HDI auto fp_8bit2float(const fp_8bit<ExpBits, Signed>& v) -> float
-  {
-    uint32_t u = v.bitstring;
-    if constexpr (Signed) {
-      u &= ~1;  // zero the sign bit
-    }
-    float r;
-    *reinterpret_cast<uint32_t*>(&r) =
-      ((u << (15u + ExpBits)) + (0x3f800000u | (0x00400000u >> ValBits)) - (ExpMask << 23));
-    if constexpr (Signed) {  // recover the sign bit
-      if (v.bitstring & 1) { r = -r; }
-    }
-    return r;
-  }
-};
-
 /**
  * Select the clusters to probe and, as a side-effect, translate the queries type `T -> float`
  *
@@ -439,464 +371,6 @@ void postprocess_distances(float* out,        // [n_queries, topk]
   }
 }
 
-template <typename T, typename IdxT>
-struct dummy_block_sort_t {
-  using queue_t = matrix::detail::select::warpsort::warp_sort_distributed<WarpSize, true, T, IdxT>;
-  template <typename... Args>
-  __device__ dummy_block_sort_t(int k, Args...){};
-};
-
-template <int Capacity, typename T, typename IdxT>
-struct pq_block_sort {
-  using type = matrix::detail::select::warpsort::
-    block_sort<matrix::detail::select::warpsort::warp_sort_distributed, Capacity, true, T, IdxT>;
-};
-
-template <typename T, typename IdxT>
-struct pq_block_sort<0, T, IdxT> : dummy_block_sort_t<T, IdxT> {
-  using type = dummy_block_sort_t<T, IdxT>;
-};
-
-template <int Capacity, typename T, typename IdxT>
-using block_sort_t = typename pq_block_sort<Capacity, T, IdxT>::type;
-
-/* Manually unrolled loop over a chunk of pq_dataset that fits into one VecT. */
-template <typename OutT,
-          typename LutT,
-          typename VecT,
-          bool CheckBounds,
-          uint32_t PqBits,
-          uint32_t BitsLeft = 0,
-          uint32_t Ix       = 0>
-__device__ __forceinline__ void ivfpq_compute_chunk(OutT& score /* NOLINT */,
-                                                    typename VecT::math_t& pq_code,
-                                                    const VecT& pq_codes,
-                                                    const LutT*& lut_head,
-                                                    const LutT*& lut_end)
-{
-  if constexpr (CheckBounds) {
-    if (lut_head >= lut_end) { return; }
-  }
-  constexpr uint32_t kTotalBits = 8 * sizeof(typename VecT::math_t);
-  constexpr uint32_t kPqShift   = 1u << PqBits;
-  constexpr uint32_t kPqMask    = kPqShift - 1u;
-  if constexpr (BitsLeft >= PqBits) {
-    uint8_t code = pq_code & kPqMask;
-    pq_code >>= PqBits;
-    score += OutT(lut_head[code]);
-    lut_head += kPqShift;
-    return ivfpq_compute_chunk<OutT, LutT, VecT, CheckBounds, PqBits, BitsLeft - PqBits, Ix>(
-      score, pq_code, pq_codes, lut_head, lut_end);
-  } else if constexpr (Ix < VecT::Ratio) {
-    uint8_t code                = pq_code;
-    pq_code                     = pq_codes.val.data[Ix];
-    constexpr uint32_t kRemBits = PqBits - BitsLeft;
-    constexpr uint32_t kRemMask = (1u << kRemBits) - 1u;
-    code |= (pq_code & kRemMask) << BitsLeft;
-    pq_code >>= kRemBits;
-    score += OutT(lut_head[code]);
-    lut_head += kPqShift;
-    return ivfpq_compute_chunk<OutT,
-                               LutT,
-                               VecT,
-                               CheckBounds,
-                               PqBits,
-                               kTotalBits - kRemBits,
-                               Ix + 1>(score, pq_code, pq_codes, lut_head, lut_end);
-  }
-}
-
-/* Compute the similarity for one vector in the pq_dataset */
-template <typename OutT, typename LutT, typename VecT, uint32_t PqBits>
-__device__ auto ivfpq_compute_score(uint32_t pq_dim,
-                                    const typename VecT::io_t* pq_head,
-                                    const LutT* lut_scores,
-                                    OutT early_stop_limit) -> OutT
-{
-  constexpr uint32_t kChunkSize = sizeof(VecT) * 8u / PqBits;
-  auto lut_head                 = lut_scores;
-  auto lut_end                  = lut_scores + (pq_dim << PqBits);
-  VecT pq_codes;
-  OutT score{0};
-  for (; pq_dim >= kChunkSize; pq_dim -= kChunkSize) {
-    *pq_codes.vectorized_data() = *pq_head;
-    pq_head += kIndexGroupSize;
-    typename VecT::math_t pq_code = 0;
-    ivfpq_compute_chunk<OutT, LutT, VecT, false, PqBits>(
-      score, pq_code, pq_codes, lut_head, lut_end);
-    // Early stop when it makes sense (otherwise early_stop_limit is kDummy/infinity).
-    if (score >= early_stop_limit) { return score; }
-  }
-  if (pq_dim > 0) {
-    *pq_codes.vectorized_data()   = *pq_head;
-    typename VecT::math_t pq_code = 0;
-    ivfpq_compute_chunk<OutT, LutT, VecT, true, PqBits>(
-      score, pq_code, pq_codes, lut_head, lut_end);
-  }
-  return score;
-}
-
-/**
- * The main kernel that computes similarity scores across multiple queries and probes.
- * When `Capacity > 0`, it also selects top K candidates for each query and probe
- * (which need to be merged across probes afterwards).
- *
- * Each block processes a (query, probe) pair: it calculates the distance between the single query
- * vector and all the dataset vector in the cluster that we are probing.
- *
- * @tparam OutT
- *   The output type - distances.
- * @tparam LutT
- *   The lookup table element type (lut_scores).
- * @tparam PqBits
- *   The bit length of an encoded vector element after compression by PQ
- *   (NB: pq_book_size = 1 << PqBits).
- * @tparam Capacity
- *   Power-of-two; the maximum possible `k` in top-k. Value zero disables fused top-k search.
- * @tparam PrecompBaseDiff
- *   Defines whether we should precompute part of the distance and keep it in shared memory
- *   before the main part (score calculation) to increase memory usage efficiency in the latter.
- *   For L2, this is the distance between the query and the cluster center.
- * @tparam EnableSMemLut
- *   Defines whether to use the shared memory for the lookup table (`lut_scores`).
- *   Setting this to `false` allows to reduce the shared memory usage (and maximum data dim)
- *   at the cost of reducing global memory reading throughput.
- *
- * @param n_rows the number of records in the dataset
- * @param dim the dimensionality of the data (NB: after rotation transform, i.e. `index.rot_dim()`).
- * @param n_probes the number of clusters to search for each query
- * @param pq_dim
- *   The dimensionality of an encoded vector after compression by PQ.
- * @param n_queries the number of queries.
- * @param metric the distance type.
- * @param codebook_kind Defines the way PQ codebooks have been trained.
- * @param topk the `k` in the select top-k.
- * @param max_samples the size of the output for a single query.
- * @param cluster_centers
- *   The device pointer to the cluster centers in the original space (NB: after rotation)
- *   [n_clusters, dim].
- * @param pq_centers
- *   The device pointer to the cluster centers in the PQ space
- *   [pq_dim, pq_book_size, pq_len] or [n_clusters, pq_book_size, pq_len,].
- * @param pq_dataset
- *   The device pointer to the PQ index (data) [n_rows, ...].
- * @param cluster_labels
- *   The device pointer to the labels (clusters) for each query and probe [n_queries, n_probes].
- * @param _chunk_indices
- *   The device pointer to the data offsets for each query and probe [n_queries, n_probes].
- * @param queries
- *   The device pointer to the queries (NB: after rotation) [n_queries, dim].
- * @param index_list
- *   An optional device pointer to the enforced order of search [n_queries, n_probes].
- *   One can pass reordered indices here to try to improve data reading locality.
- * @param lut_scores
- *   The device pointer for storing the lookup table globally [gridDim.x, pq_dim << PqBits].
- *   Ignored when `EnableSMemLut == true`.
- * @param _out_scores
- *   The device pointer to the output scores
- *   [n_queries, max_samples] or [n_queries, n_probes, topk].
- * @param _out_indices
- *   The device pointer to the output indices [n_queries, n_probes, topk].
- *   These are the indices of the records as they appear in the database view formed by the probed
- *   clusters / defined by the `_chunk_indices`.
- *   The indices can have values within the range [0, max_samples).
- *   Ignored  when `Capacity == 0`.
- */
-template <typename OutT,
-          typename LutT,
-          uint32_t PqBits,
-          int Capacity,
-          bool PrecompBaseDiff,
-          bool EnableSMemLut>
-__global__ void compute_similarity_kernel(uint32_t n_rows,
-                                          uint32_t dim,
-                                          uint32_t n_probes,
-                                          uint32_t pq_dim,
-                                          uint32_t n_queries,
-                                          distance::DistanceType metric,
-                                          codebook_gen codebook_kind,
-                                          uint32_t topk,
-                                          uint32_t max_samples,
-                                          const float* cluster_centers,
-                                          const float* pq_centers,
-                                          const uint8_t* const* pq_dataset,
-                                          const uint32_t* cluster_labels,
-                                          const uint32_t* _chunk_indices,
-                                          const float* queries,
-                                          const uint32_t* index_list,
-                                          float* query_kths,
-                                          LutT* lut_scores,
-                                          OutT* _out_scores,
-                                          uint32_t* _out_indices)
-{
-  /* Shared memory:
-
-    * lut_scores: lookup table (LUT) of size = `pq_dim << PqBits`  (when EnableSMemLut)
-    * base_diff: size = dim (which is equal to `pq_dim * pq_len`)  or dim*2
-    * topk::block_sort: some amount of shared memory, but overlaps with the rest:
-        block_sort only needs shared memory for `.done()` operation, which can come very last.
-  */
-  extern __shared__ __align__(256) uint8_t smem_buf[];  // NOLINT
-  constexpr bool kManageLocalTopK = Capacity > 0;
-
-  constexpr uint32_t PqShift = 1u << PqBits;  // NOLINT
-  constexpr uint32_t PqMask  = PqShift - 1u;  // NOLINT
-
-  const uint32_t pq_len   = dim / pq_dim;
-  const uint32_t lut_size = pq_dim * PqShift;
-
-  if constexpr (EnableSMemLut) {
-    lut_scores = reinterpret_cast<LutT*>(smem_buf);
-  } else {
-    lut_scores += lut_size * blockIdx.x;
-  }
-
-  float* base_diff = nullptr;
-  if constexpr (PrecompBaseDiff) {
-    if constexpr (EnableSMemLut) {
-      base_diff = reinterpret_cast<float*>(lut_scores + lut_size);
-    } else {
-      base_diff = reinterpret_cast<float*>(smem_buf);
-    }
-  }
-
-  for (int ib = blockIdx.x; ib < n_queries * n_probes; ib += gridDim.x) {
-    if (ib >= gridDim.x) {
-      // sync shared memory accesses on the second and further iterations
-      __syncthreads();
-    }
-    uint32_t query_ix;
-    uint32_t probe_ix;
-    if (index_list == nullptr) {
-      query_ix = ib % n_queries;
-      probe_ix = ib / n_queries;
-    } else {
-      auto ordered_ix = index_list[ib];
-      query_ix        = ordered_ix / n_probes;
-      probe_ix        = ordered_ix % n_probes;
-    }
-
-    const uint32_t* chunk_indices = _chunk_indices + (n_probes * query_ix);
-    const float* query            = queries + (dim * query_ix);
-    OutT* out_scores;
-    uint32_t* out_indices = nullptr;
-    if constexpr (kManageLocalTopK) {
-      // Store topk calculated distances to out_scores (and its indices to out_indices)
-      out_scores  = _out_scores + topk * (probe_ix + (n_probes * query_ix));
-      out_indices = _out_indices + topk * (probe_ix + (n_probes * query_ix));
-    } else {
-      // Store all calculated distances to out_scores
-      out_scores = _out_scores + max_samples * query_ix;
-    }
-    uint32_t label              = cluster_labels[n_probes * query_ix + probe_ix];
-    const float* cluster_center = cluster_centers + (dim * label);
-    const float* pq_center;
-    if (codebook_kind == codebook_gen::PER_SUBSPACE) {
-      pq_center = pq_centers;
-    } else {
-      pq_center = pq_centers + (pq_len << PqBits) * label;
-    }
-
-    if constexpr (PrecompBaseDiff) {
-      // Reduce number of memory reads later by pre-computing parts of the score
-      switch (metric) {
-        case distance::DistanceType::L2SqrtExpanded:
-        case distance::DistanceType::L2Expanded: {
-          for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) {
-            base_diff[i] = query[i] - cluster_center[i];
-          }
-        } break;
-        case distance::DistanceType::InnerProduct: {
-          float2 pvals;
-          for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) {
-            pvals.x                                 = query[i];
-            pvals.y                                 = cluster_center[i] * pvals.x;
-            reinterpret_cast<float2*>(base_diff)[i] = pvals;
-          }
-        } break;
-        default: __builtin_unreachable();
-      }
-      __syncthreads();
-    }
-
-    {
-      // Create a lookup table
-      // For each subspace, the lookup table stores the distance between the actual query vector
-      // (projected into the subspace) and all possible pq vectors in that subspace.
-      for (uint32_t i = threadIdx.x; i < lut_size; i += blockDim.x) {
-        const uint32_t i_pq  = i >> PqBits;
-        uint32_t j           = i_pq * pq_len;
-        const uint32_t j_end = pq_len + j;
-        auto cur_pq_center   = pq_center + (i & PqMask) +
-                             (codebook_kind == codebook_gen::PER_SUBSPACE ? j * PqShift : 0u);
-        float score = 0.0;
-        do {
-          float pq_c = *cur_pq_center;
-          cur_pq_center += PqShift;
-          switch (metric) {
-            case distance::DistanceType::L2SqrtExpanded:
-            case distance::DistanceType::L2Expanded: {
-              float diff;
-              if constexpr (PrecompBaseDiff) {
-                diff = base_diff[j];
-              } else {
-                diff = query[j] - cluster_center[j];
-              }
-              diff -= pq_c;
-              score += diff * diff;
-            } break;
-            case distance::DistanceType::InnerProduct: {
-              // NB: we negate the scores as we hardcoded select-topk to always compute the minimum
-              float q;
-              if constexpr (PrecompBaseDiff) {
-                float2 pvals = reinterpret_cast<float2*>(base_diff)[j];
-                q            = pvals.x;
-                score -= pvals.y;
-              } else {
-                q = query[j];
-                score -= q * cluster_center[j];
-              }
-              score -= q * pq_c;
-            } break;
-            default: __builtin_unreachable();
-          }
-        } while (++j < j_end);
-        lut_scores[i] = LutT(score);
-      }
-    }
-
-    // Define helper types for efficient access to the pq_dataset, which is stored in an interleaved
-    // format. The chunks of PQ data are stored in kIndexGroupVecLen-bytes-long chunks, interleaved
-    // in groups of kIndexGroupSize elems (which is normally equal to the warp size) for the fastest
-    // possible access by thread warps.
-    //
-    // Consider one record in the pq_dataset is `pq_dim * pq_bits`-bit-long.
-    // Assuming `kIndexGroupVecLen = 16`, one chunk of data read by a thread at once is 128-bits.
-    // Then, such a chunk contains `chunk_size = 128 / pq_bits` record elements, and the record
-    // consists of `ceildiv(pq_dim, chunk_size)` chunks. The chunks are interleaved in groups of 32,
-    // so that the warp can achieve the best coalesced read throughput.
-    using group_align  = Pow2<kIndexGroupSize>;
-    using vec_align    = Pow2<kIndexGroupVecLen>;
-    using local_topk_t = block_sort_t<Capacity, OutT, uint32_t>;
-    using op_t         = uint32_t;
-    using vec_t        = TxN_t<op_t, kIndexGroupVecLen / sizeof(op_t)>;
-
-    uint32_t sample_offset = 0;
-    if (probe_ix > 0) { sample_offset = chunk_indices[probe_ix - 1]; }
-    uint32_t n_samples            = chunk_indices[probe_ix] - sample_offset;
-    uint32_t n_samples_aligned    = group_align::roundUp(n_samples);
-    constexpr uint32_t kChunkSize = (kIndexGroupVecLen * 8u) / PqBits;
-    uint32_t pq_line_width        = div_rounding_up_unsafe(pq_dim, kChunkSize) * kIndexGroupVecLen;
-    auto pq_thread_data = pq_dataset[label] + group_align::roundDown(threadIdx.x) * pq_line_width +
-                          group_align::mod(threadIdx.x) * vec_align::Value;
-    pq_line_width *= blockDim.x;
-
-    constexpr OutT kDummy = upper_bound<OutT>();
-    OutT query_kth        = kDummy;
-    if constexpr (kManageLocalTopK) { query_kth = OutT(query_kths[query_ix]); }
-    local_topk_t block_topk(topk, nullptr, query_kth);
-    OutT early_stop_limit = kDummy;
-    switch (metric) {
-      // If the metric is non-negative, we can use the query_kth approximation as an early stop
-      // threshold to skip some iterations when computing the score. Add such metrics here.
-      case distance::DistanceType::L2SqrtExpanded:
-      case distance::DistanceType::L2Expanded: {
-        early_stop_limit = query_kth;
-      } break;
-      default: break;
-    }
-
-    // Ensure lut_scores is written by all threads before using it in ivfpq-compute-score
-    __threadfence_block();
-    __syncthreads();
-
-    // Compute a distance for each sample
-    for (uint32_t i = threadIdx.x; i < n_samples_aligned;
-         i += blockDim.x, pq_thread_data += pq_line_width) {
-      OutT score = kDummy;
-      bool valid = i < n_samples;
-      if (valid) {
-        score = ivfpq_compute_score<OutT, LutT, vec_t, PqBits>(
-          pq_dim,
-          reinterpret_cast<const vec_t::io_t*>(pq_thread_data),
-          lut_scores,
-          early_stop_limit);
-      }
-      if constexpr (kManageLocalTopK) {
-        block_topk.add(score, sample_offset + i);
-      } else {
-        if (valid) { out_scores[sample_offset + i] = score; }
-      }
-    }
-    if constexpr (kManageLocalTopK) {
-      // sync threads before the topk merging operation, because we reuse smem_buf
-      __syncthreads();
-      block_topk.done(smem_buf);
-      block_topk.store(out_scores, out_indices);
-      if (threadIdx.x == 0) { atomicMin(query_kths + query_ix, float(out_scores[topk - 1])); }
-    } else {
-      // fill in the rest of the out_scores with dummy values
-      if (probe_ix + 1 == n_probes) {
-        for (uint32_t i = threadIdx.x + sample_offset + n_samples; i < max_samples;
-             i += blockDim.x) {
-          out_scores[i] = kDummy;
-        }
-      }
-    }
-  }
-}
-
-// The signature of the kernel defined by a minimal set of template parameters
-template <typename OutT, typename LutT>
-using compute_similarity_kernel_t =
-  decltype(&compute_similarity_kernel<OutT, LutT, 8, 0, true, true>);
-
-// The config struct lifts the runtime parameters to the template parameters
-template <typename OutT, typename LutT, bool PrecompBaseDiff, bool EnableSMemLut>
-struct compute_similarity_kernel_config {
- public:
-  static auto get(uint32_t pq_bits, uint32_t k_max) -> compute_similarity_kernel_t<OutT, LutT>
-  {
-    return kernel_choose_bits(pq_bits, k_max);
-  }
-
- private:
-  static auto kernel_choose_bits(uint32_t pq_bits, uint32_t k_max)
-    -> compute_similarity_kernel_t<OutT, LutT>
-  {
-    switch (pq_bits) {
-      case 4: return kernel_try_capacity<4, kMaxCapacity>(k_max);
-      case 5: return kernel_try_capacity<5, kMaxCapacity>(k_max);
-      case 6: return kernel_try_capacity<6, kMaxCapacity>(k_max);
-      case 7: return kernel_try_capacity<7, kMaxCapacity>(k_max);
-      case 8: return kernel_try_capacity<8, kMaxCapacity>(k_max);
-      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
-    }
-  }
-
-  template <uint32_t PqBits, int Capacity>
-  static auto kernel_try_capacity(uint32_t k_max) -> compute_similarity_kernel_t<OutT, LutT>
-  {
-    if constexpr (Capacity > 0) {
-      if (k_max == 0 || k_max > Capacity) { return kernel_try_capacity<PqBits, 0>(k_max); }
-    }
-    if constexpr (Capacity > 1) {
-      if (k_max * 2 <= Capacity) { return kernel_try_capacity<PqBits, (Capacity / 2)>(k_max); }
-    }
-    return compute_similarity_kernel<OutT, LutT, PqBits, Capacity, PrecompBaseDiff, EnableSMemLut>;
-  }
-};
-
-// A standalone accessor function is necessary to make sure template specializations work correctly
-// (we "extern template" this function)
-template <typename OutT, typename LutT, bool PrecompBaseDiff, bool EnableSMemLut>
-auto get_compute_similarity_kernel(uint32_t pq_bits, uint32_t k_max)
-  -> compute_similarity_kernel_t<OutT, LutT>
-{
-  return compute_similarity_kernel_config<OutT, LutT, PrecompBaseDiff, EnableSMemLut>::get(pq_bits,
-                                                                                           k_max);
-}
-
 /**
  * An approximation to the number of times each cluster appears in a batched sample.
  *
@@ -930,318 +404,6 @@ constexpr inline auto expected_probe_coresidency(uint32_t n_clusters,
   return 1 + (n_queries - 1) * n_probes / (2 * n_clusters);
 }
 
-/**
- * Estimate a carveout value as expected by `cudaFuncAttributePreferredSharedMemoryCarveout`
- * (which does not take into account `reservedSharedMemPerBlock`),
- * given by a desired schmem-L1 split and a per-block memory requirement in bytes.
- *
- * NB: As per the programming guide, the memory carveout setting is just a hint for the driver; it's
- * free to choose any shmem-L1 configuration it deems appropriate. For example, if you set the
- * carveout to zero, it will choose a non-zero config that will allow to run at least one active
- * block per SM.
- *
- * @param shmem_fraction
- *   a fraction representing a desired split (shmem / (shmem + L1)) [0, 1].
- * @param shmem_per_block
- *   a shared memory usage per block (dynamic + static shared memory sizes), in bytes.
- * @param dev_props
- *   device properties.
- * @return
- *   a carveout value in percents [0, 100].
- */
-constexpr inline auto estimate_carveout(double shmem_fraction,
-                                        size_t shmem_per_block,
-                                        const cudaDeviceProp& dev_props) -> int
-{
-  using shmem_unit = Pow2<128>;
-  size_t m         = shmem_unit::roundUp(shmem_per_block);
-  size_t r         = dev_props.reservedSharedMemPerBlock;
-  size_t s         = dev_props.sharedMemPerMultiprocessor;
-  return (size_t(100 * s * m * shmem_fraction) - (m - 1) * r) / (s * (m + r));
-}
-
-/** Select an appropriate kernel instance and launch parameters. */
-template <typename OutT, typename LutT>
-struct compute_similarity {
-  /** Estimate the occupancy for the given kernel on the given device. */
-  struct occupancy_t {
-    using shmem_unit = Pow2<128>;
-
-    int blocks_per_sm = 0;
-    double occupancy  = 0.0;
-    double shmem_use  = 1.0;
-
-    inline occupancy_t() = default;
-    inline occupancy_t(size_t smem,
-                       uint32_t n_threads,
-                       compute_similarity_kernel_t<OutT, LutT> kernel,
-                       const cudaDeviceProp& dev_props)
-    {
-      RAFT_CUDA_TRY(
-        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, kernel, n_threads, smem));
-      occupancy = double(blocks_per_sm * n_threads) / double(dev_props.maxThreadsPerMultiProcessor);
-      shmem_use = double(shmem_unit::roundUp(smem) * blocks_per_sm) /
-                  double(dev_props.sharedMemPerMultiprocessor);
-    }
-  };
-
-  struct selected {
-    compute_similarity_kernel_t<OutT, LutT> kernel;
-    dim3 grid_dim;
-    dim3 block_dim;
-    size_t smem_size;
-    size_t device_lut_size;
-
-    template <typename... Args>
-    void operator()(rmm::cuda_stream_view stream, Args... args)
-    {
-      kernel<<<grid_dim, block_dim, smem_size, stream>>>(args...);
-      RAFT_CHECK_CUDA(stream);
-    }
-  };
-
-  /**
-   * Use heuristics to choose an optimal instance of the search kernel.
-   * It selects among a few kernel variants (with/out using shared mem for
-   * lookup tables / precomputed distances) and tries to choose the block size
-   * to maximize kernel occupancy.
-   *
-   * @param manage_local_topk
-   *    whether use the fused calculate+select or just calculate the distances for each
-   *    query and probed cluster.
-   *
-   * @param locality_hint
-   *    beyond this limit do not consider increasing the number of active blocks per SM
-   *    would improve locality anymore.
-   */
-  static inline auto select(const cudaDeviceProp& dev_props,
-                            bool manage_local_topk,
-                            int locality_hint,
-                            double preferred_shmem_carveout,
-                            uint32_t pq_bits,
-                            uint32_t pq_dim,
-                            uint32_t precomp_data_count,
-                            uint32_t n_queries,
-                            uint32_t n_probes,
-                            uint32_t topk) -> selected
-  {
-    // Shared memory for storing the lookup table
-    size_t lut_mem = sizeof(LutT) * (pq_dim << pq_bits);
-    // Shared memory for storing pre-computed pieces to speedup the lookup table construction
-    // (e.g. the distance between a cluster center and the query for L2).
-    size_t bdf_mem = sizeof(float) * precomp_data_count;
-    // Shared memory for the fused top-k component; it may overlap with the other uses of shared
-    // memory and depends on the number of threads.
-    struct ltk_mem_t {
-      uint32_t subwarp_size;
-      uint32_t topk;
-      bool manage_local_topk;
-      ltk_mem_t(bool manage_local_topk, uint32_t topk)
-        : manage_local_topk(manage_local_topk), topk(topk)
-      {
-        subwarp_size = WarpSize;
-        while (topk * 2 <= subwarp_size) {
-          subwarp_size /= 2;
-        }
-      }
-
-      [[nodiscard]] auto operator()(uint32_t n_threads) const -> size_t
-      {
-        return manage_local_topk ? matrix::detail::select::warpsort::
-                                     template calc_smem_size_for_block_wide<OutT, uint32_t>(
-                                       n_threads / subwarp_size, topk)
-                                 : 0;
-      }
-    } ltk_mem{manage_local_topk, topk};
-
-    // Total amount of work; should be enough to occupy the GPU.
-    uint32_t n_blocks = n_queries * n_probes;
-
-    // The minimum block size we may want:
-    //   1. It's a power-of-two for efficient L1 caching of pq_centers values
-    //      (multiples of `1 << pq_bits`).
-    //   2. It should be large enough to fully utilize an SM.
-    uint32_t n_threads_min = WarpSize;
-    while (dev_props.maxBlocksPerMultiProcessor * int(n_threads_min) <
-           dev_props.maxThreadsPerMultiProcessor) {
-      n_threads_min *= 2;
-    }
-    // Further increase the minimum block size to make sure full device occupancy
-    // (NB: this may lead to `n_threads_min` being larger than the kernel's maximum)
-    while (int(n_blocks * n_threads_min) <
-             dev_props.multiProcessorCount * dev_props.maxThreadsPerMultiProcessor &&
-           int(n_threads_min) < dev_props.maxThreadsPerBlock) {
-      n_threads_min *= 2;
-    }
-    // Even further, increase it to allow less blocks per SM if there not enough queries.
-    // With this, we reduce the chance of different clusters being processed by two blocks
-    // on the same SM and thus improve the data locality for L1 caching.
-    while (int(n_queries * n_threads_min) < dev_props.maxThreadsPerMultiProcessor &&
-           int(n_threads_min) < dev_props.maxThreadsPerBlock) {
-      n_threads_min *= 2;
-    }
-
-    // Granularity of changing the number of threads when computing the maximum block size.
-    // It's good to have it multiple of the PQ book width.
-    uint32_t n_threads_gty = round_up_safe<uint32_t>(1u << pq_bits, WarpSize);
-
-    /*
-     Shared memory / L1 cache balance is the main limiter of this kernel.
-     The more blocks per SM we launch, the more shared memory we need. Besides that, we have
-     three versions of the kernel varying in performance and shmem usage.
-
-     We try the most demanding and the fastest kernel first, trying to maximize occupancy with
-     the minimum number of blocks (just one, really). Then, we tweak the `n_threads` to further
-     optimize occupancy and data locality for the L1 cache.
-     */
-    auto conf_fast        = get_compute_similarity_kernel<OutT, LutT, true, true>;
-    auto conf_no_basediff = get_compute_similarity_kernel<OutT, LutT, false, true>;
-    auto conf_no_smem_lut = get_compute_similarity_kernel<OutT, LutT, true, false>;
-    auto topk_or_zero     = manage_local_topk ? topk : 0u;
-    std::array candidates{
-      std::make_tuple(conf_fast(pq_bits, topk_or_zero), lut_mem + bdf_mem, true),
-      std::make_tuple(conf_no_basediff(pq_bits, topk_or_zero), lut_mem, true),
-      std::make_tuple(conf_no_smem_lut(pq_bits, topk_or_zero), bdf_mem, false)};
-
-    // we may allow slightly lower than 100% occupancy;
-    constexpr double kTargetOccupancy = 0.75;
-    // This struct is used to select the better candidate
-    occupancy_t selected_perf{};
-    selected selected_config;
-    for (auto [kernel, smem_size_const, lut_is_in_shmem] : candidates) {
-      if (smem_size_const > dev_props.sharedMemPerBlockOptin) {
-        // Even a single block cannot fit into an SM due to shmem requirements. Skip the candidate.
-        continue;
-      }
-
-      // First, we set the carveout hint to the preferred value. The driver will increase this if
-      // needed to run at least one block per SM. At the same time, if more blocks fit into one SM,
-      // this carveout value will limit the calculated occupancy. When we're done selecting the best
-      // launch configuration, we will tighten the carveout once more, based on the final memory
-      // usage and occupancy.
-      const int max_carveout =
-        estimate_carveout(preferred_shmem_carveout, smem_size_const, dev_props);
-      RAFT_CUDA_TRY(
-        cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout, max_carveout));
-
-      // Get the theoretical maximum possible number of threads per block
-      cudaFuncAttributes kernel_attrs;
-      RAFT_CUDA_TRY(cudaFuncGetAttributes(&kernel_attrs, kernel));
-      uint32_t n_threads =
-        round_down_safe<uint32_t>(kernel_attrs.maxThreadsPerBlock, n_threads_gty);
-
-      // Actual required shmem depens on the number of threads
-      size_t smem_size = max(smem_size_const, ltk_mem(n_threads));
-
-      // Make sure the kernel can get enough shmem.
-      cudaError_t cuda_status =
-        cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
-      if (cuda_status != cudaSuccess) {
-        RAFT_EXPECTS(
-          cuda_status == cudaGetLastError(),
-          "Tried to reset the expected cuda error code, but it didn't match the expectation");
-        // Failed to request enough shmem for the kernel. Skip the candidate.
-        continue;
-      }
-
-      occupancy_t cur(smem_size, n_threads, kernel, dev_props);
-      if (cur.blocks_per_sm <= 0) {
-        // For some reason, we still cannot make this kernel run. Skip the candidate.
-        continue;
-      }
-
-      {
-        // Try to reduce the number of threads to increase occupancy and data locality
-        auto n_threads_tmp = n_threads_min;
-        while (n_threads_tmp * 2 < n_threads) {
-          n_threads_tmp *= 2;
-        }
-        if (n_threads_tmp < n_threads) {
-          while (n_threads_tmp >= n_threads_min) {
-            auto smem_size_tmp = max(smem_size_const, ltk_mem(n_threads_tmp));
-            occupancy_t tmp(smem_size_tmp, n_threads_tmp, kernel, dev_props);
-            bool select_it = false;
-            if (lut_is_in_shmem && locality_hint >= tmp.blocks_per_sm) {
-              // Normally, the smaller the block the better for L1 cache hit rate.
-              // Hence, the occupancy should be "just good enough"
-              select_it = tmp.occupancy >= min(kTargetOccupancy, cur.occupancy);
-            } else if (lut_is_in_shmem) {
-              // If we don't have enough repeating probes (locality_hint < tmp.blocks_per_sm),
-              // the locality is not going to improve with increasing the number of blocks per SM.
-              // Hence, the only metric here is the occupancy.
-              bool improves_occupancy = tmp.occupancy > cur.occupancy;
-              // Otherwise, the performance still improves with a smaller block size,
-              // given there is enough work to do
-              bool improves_parallelism =
-                tmp.occupancy == cur.occupancy &&
-                7u * tmp.blocks_per_sm * dev_props.multiProcessorCount <= n_blocks;
-              select_it = improves_occupancy || improves_parallelism;
-            } else {
-              // If we don't use shared memory for the lookup table, increasing the number of blocks
-              // is very taxing on the global memory usage.
-              // In this case, the occupancy must increase a lot to make it worth the cost.
-              select_it = tmp.occupancy >= min(1.0, cur.occupancy / kTargetOccupancy);
-            }
-            if (select_it) {
-              n_threads = n_threads_tmp;
-              smem_size = smem_size_tmp;
-              cur       = tmp;
-            }
-            n_threads_tmp /= 2;
-          }
-        }
-      }
-
-      {
-        if (selected_perf.occupancy <= 0.0  // no candidate yet
-            || (selected_perf.occupancy < cur.occupancy * kTargetOccupancy &&
-                selected_perf.shmem_use >= cur.shmem_use)  // much improved occupancy
-        ) {
-          selected_perf = cur;
-          if (lut_is_in_shmem) {
-            selected_config = {
-              kernel, dim3(n_blocks, 1, 1), dim3(n_threads, 1, 1), smem_size, size_t(0)};
-          } else {
-            // When the global memory is used for the lookup table, we need to minimize the grid
-            // size; otherwise, the kernel may quickly run out of memory.
-            auto n_blocks_min =
-              std::min<uint32_t>(n_blocks, cur.blocks_per_sm * dev_props.multiProcessorCount);
-            selected_config = {kernel,
-                               dim3(n_blocks_min, 1, 1),
-                               dim3(n_threads, 1, 1),
-                               smem_size,
-                               size_t(n_blocks_min) * size_t(pq_dim << pq_bits)};
-          }
-          // Actual shmem/L1 split wildly rounds up the specified preferred carveout, so we set here
-          // a rather conservative bar; most likely, the kernel gets more shared memory than this,
-          // and the occupancy doesn't get hurt.
-          auto carveout = std::min<int>(max_carveout, std::ceil(100.0 * cur.shmem_use));
-          RAFT_CUDA_TRY(
-            cudaFuncSetAttribute(kernel, cudaFuncAttributePreferredSharedMemoryCarveout, carveout));
-          if (cur.occupancy >= kTargetOccupancy) { break; }
-        } else if (selected_perf.occupancy > 0.0) {
-          // If we found a reasonable candidate on a previous iteration, and this one is not better,
-          // then don't try any more candidates because they are much slower anyway.
-          break;
-        }
-      }
-    }
-
-    RAFT_EXPECTS(selected_perf.occupancy > 0.0,
-                 "Couldn't determine a working kernel launch configuration.");
-
-    return selected_config;
-  }
-};
-
-inline auto is_local_topk_feasible(uint32_t k, uint32_t n_probes, uint32_t n_queries) -> bool
-{
-  if (k > kMaxCapacity) { return false; }             // warp_sort not possible
-  if (n_probes <= 16) { return false; }               // too few clusters
-  if (n_queries * n_probes <= 256) { return false; }  // overall amount of work is too small
-  return true;
-}
-
 /**
  * The "main part" of the search, which assumes that outer-level `search` has already:
  *
@@ -1364,16 +526,16 @@ void ivfpq_search_worker(raft::device_resources const& handle,
     } break;
   }
 
-  auto search_instance = compute_similarity<ScoreT, LutT>::select(handle.get_device_properties(),
-                                                                  manage_local_topk,
-                                                                  coresidency,
-                                                                  preferred_shmem_carveout,
-                                                                  index.pq_bits(),
-                                                                  index.pq_dim(),
-                                                                  precomp_data_count,
-                                                                  n_queries,
-                                                                  n_probes,
-                                                                  topK);
+  auto search_instance = compute_similarity_select<ScoreT, LutT>(handle.get_device_properties(),
+                                                                 manage_local_topk,
+                                                                 coresidency,
+                                                                 preferred_shmem_carveout,
+                                                                 index.pq_bits(),
+                                                                 index.pq_dim(),
+                                                                 precomp_data_count,
+                                                                 n_queries,
+                                                                 n_probes,
+                                                                 topK);
 
   rmm::device_uvector<LutT> device_lut(search_instance.device_lut_size, stream, mr);
   std::optional<device_vector<float>> query_kths_buf{std::nullopt};
diff --git a/cpp/include/raft/neighbors/ivf_pq-ext.cuh b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
new file mode 100644
index 0000000000..2ad32080c6
--- /dev/null
+++ b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>  // int64_t
+#include <raft/neighbors/ivf_pq_types.hpp>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE
+
+namespace raft::neighbors::ivf_pq {
+
+/**
+ * @defgroup ivf_pq IVF PQ Algorithm
+ * @{
+ */
+
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * NB: Currently, the following distance metrics are supported:
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset (defaults to `uint32_t`)
+ *
+ * @param[in] handle the raft device resources
+ * @param[in] params parameters that configure the index building
+ * @param[in] dataset a device matrix view of a row-major matrix [n_rows, dim]
+ *
+ * @return the constructed ivf-pq index
+ */
+template <typename T, typename IdxT = uint32_t>
+index<IdxT> build(raft::device_resources const& handle,
+                  const index_params& params,
+                  raft::device_matrix_view<const T, IdxT, row_major> dataset) RAFT_EXPLICIT;
+
+/**
+ * @brief Extend the index with the new data.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] handle the raft device resources
+ * @param[in] new_vectors a device matrix view of a row-major matrix [n_rows, idx.dim()]
+ * @param[in] new_indices an optional device matrix view of a vector of indices [n_rows].
+ *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] idx the original index; taken by const reference, the result is returned by value
+ */
+template <typename T, typename IdxT>
+index<IdxT> extend(raft::device_resources const& handle,
+                   raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
+                   std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,
+                   const index<IdxT>& idx) RAFT_EXPLICIT;
+
+/**
+ * @brief Extend the index with the new data.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] handle the raft device resources
+ * @param[in] new_vectors a device matrix view of a row-major matrix [n_rows, idx.dim()]
+ * @param[in] new_indices an optional device matrix view of a vector of indices [n_rows].
+ *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[inout] idx pointer to the index that receives the new data
+ */
+template <typename T, typename IdxT>
+void extend(raft::device_resources const& handle,
+            raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
+            std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,
+            index<IdxT>* idx) RAFT_EXPLICIT;
+
+/**
+ * @brief Search ANN using the constructed index.
+ *
+ * See the [ivf_pq::build](#ivf_pq::build) documentation for a usage example.
+ *
+ * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
+ * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
+ * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
+ * entirely eliminate the allocations happening within `search`.
+ * The exact size of the temporary buffer depends on multiple factors and is an implementation
+ * detail. However, you can safely specify a small initial size for the memory pool, so that only a
+ * few allocations happen to grow it during the first invocations of the `search`.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] handle the raft device resources
+ * @param[in] params parameters that configure the search
+ * @param[in] idx ivf-pq constructed index
+ * @param[in] queries a device matrix view of a row-major matrix [n_queries, idx.dim()]
+ * @param[out] neighbors a device matrix view of the indices of the neighbors in the source dataset
+ * [n_queries, k]
+ * @param[out] distances a device matrix view of the distances to the selected neighbors [n_queries,
+ * k]
+ */
+template <typename T, typename IdxT>
+void search(raft::device_resources const& handle,
+            const search_params& params,
+            const index<IdxT>& idx,
+            raft::device_matrix_view<const T, IdxT, row_major> queries,
+            raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,
+            raft::device_matrix_view<float, IdxT, row_major> distances) RAFT_EXPLICIT;
+
+/** @} */  // end group ivf_pq
+
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * NB: Currently, the following distance metrics are supported:
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   // use default index parameters
+ *   ivf_pq::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_pq::build(handle, index_params, dataset, N, D);
+ *   // use default search parameters
+ *   ivf_pq::search_params search_params;
+ *   // search K nearest neighbours for each of the N queries
+ *   ivf_pq::search(handle, search_params, index, queries, N, K, out_inds, out_dists);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset (defaults to `uint32_t`)
+ *
+ * @param[in] handle the raft device resources
+ * @param[in] params parameters that configure the index building
+ * @param[in] dataset a device/host pointer to a row-major matrix [n_rows, dim]
+ * @param[in] n_rows the number of samples
+ * @param[in] dim the dimensionality of the data
+ *
+ * @return the constructed ivf-pq index
+ */
+template <typename T, typename IdxT = uint32_t>
+auto build(raft::device_resources const& handle,
+           const index_params& params,
+           const T* dataset,
+           IdxT n_rows,
+           uint32_t dim) -> index<IdxT> RAFT_EXPLICIT;
+
+/**
+ * @brief Build a new index containing the data of the original plus new extra vectors.
+ *
+ * Implementation note:
+ *    The new data is clustered according to the existing kmeans clusters; the cluster
+ *    centers are left unchanged.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   ivf_pq::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_pq::build(handle, index_params, dataset, N, D);
+ *   // fill the index with the data
+ *   auto index = ivf_pq::extend(handle, index_empty, dataset, nullptr, N);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] handle the raft device resources
+ * @param[in] idx original index (taken by const reference)
+ * @param[in] new_vectors a device/host pointer to a row-major matrix [n_rows, idx.dim()]
+ * @param[in] new_indices a device/host pointer to a vector of indices [n_rows].
+ *    If the original index is empty (`idx.size() == 0`), you can pass `nullptr`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] n_rows the number of samples
+ *
+ * @return the constructed extended ivf-pq index
+ */
+template <typename T, typename IdxT>
+auto extend(raft::device_resources const& handle,
+            const index<IdxT>& idx,
+            const T* new_vectors,
+            const IdxT* new_indices,
+            IdxT n_rows) -> index<IdxT> RAFT_EXPLICIT;
+
+/**
+ * @brief Extend the index with the new data.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] handle the raft device resources
+ * @param[inout] idx pointer to the index that receives the new data
+ * @param[in] new_vectors a device/host pointer to a row-major matrix [n_rows, idx.dim()]
+ * @param[in] new_indices a device/host pointer to a vector of indices [n_rows].
+ *    If the original index is empty (`idx.size() == 0`), you can pass `nullptr`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] n_rows the number of samples
+ */
+template <typename T, typename IdxT>
+void extend(raft::device_resources const& handle,
+            index<IdxT>* idx,
+            const T* new_vectors,
+            const IdxT* new_indices,
+            IdxT n_rows) RAFT_EXPLICIT;
+
+/**
+ * @brief Search ANN using the constructed index.
+ *
+ * See the [ivf_pq::build](#ivf_pq::build) documentation for a usage example.
+ *
+ * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
+ * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
+ * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
+ * entirely eliminate the allocations happening within `search`:
+ * @code{.cpp}
+ *   ...
+ *   // Create a pooling memory resource with a pre-defined initial size.
+ *   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr(
+ *     rmm::mr::get_current_device_resource(), 1024 * 1024);
+ *   // use default search parameters
+ *   ivf_pq::search_params search_params;
+ *   // Use the same allocator across multiple searches to reduce the number of
+ *   // cuda memory allocations
+ *   ivf_pq::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr);
+ *   ivf_pq::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr);
+ *   ivf_pq::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr);
+ *   ...
+ * @endcode
+ * The exact size of the temporary buffer depends on multiple factors and is an implementation
+ * detail. However, you can safely specify a small initial size for the memory pool, so that only a
+ * few allocations happen to grow it during the first invocations of the `search`.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] handle the raft device resources
+ * @param[in] params parameters that configure the search
+ * @param[in] idx ivf-pq constructed index
+ * @param[in] queries a device pointer to a row-major matrix [n_queries, idx.dim()]
+ * @param[in] n_queries the batch size
+ * @param[in] k the number of neighbors to find for each query.
+ * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
+ * [n_queries, k]
+ * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
+ * @param[in] mr an optional memory resource to use across the searches (you can provide a large
+ * enough memory pool here to avoid memory allocations within search); defaults to `nullptr`.
+ */
+template <typename T, typename IdxT>
+void search(raft::device_resources const& handle,
+            const raft::neighbors::ivf_pq::search_params& params,
+            const index<IdxT>& idx,
+            const T* queries,
+            uint32_t n_queries,
+            uint32_t k,
+            IdxT* neighbors,
+            float* distances,
+            rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT;
+
+}  // namespace raft::neighbors::ivf_pq
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+// Explicit instantiation declarations (`extern template`) of both ivf_pq::build overloads.
+#define instantiate_raft_neighbors_ivf_pq_build(T, IdxT)                                        \
+  extern template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::build<T, IdxT>( \
+    raft::device_resources const& handle,                                                       \
+    const raft::neighbors::ivf_pq::index_params& params,                                        \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset);                                \
+                                                                                                \
+  extern template auto raft::neighbors::ivf_pq::build(                                          \
+    raft::device_resources const& handle,                                                       \
+    const raft::neighbors::ivf_pq::index_params& params,                                        \
+    const T* dataset,                                                                           \
+    IdxT n_rows,                                                                                \
+    uint32_t dim)                                                                               \
+    ->raft::neighbors::ivf_pq::index<IdxT>
+
+instantiate_raft_neighbors_ivf_pq_build(float, int64_t);
+instantiate_raft_neighbors_ivf_pq_build(int8_t, int64_t);
+instantiate_raft_neighbors_ivf_pq_build(uint8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_pq_build
+// Explicit instantiation declarations (`extern template`) of the four ivf_pq::extend overloads.
+#define instantiate_raft_neighbors_ivf_pq_extend(T, IdxT)                                        \
+  extern template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::extend<T, IdxT>( \
+    raft::device_resources const& handle,                                                        \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                              \
+    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,            \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx);                                            \
+                                                                                                 \
+  extern template void raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
+    raft::device_resources const& handle,                                                        \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                              \
+    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,            \
+    raft::neighbors::ivf_pq::index<IdxT>* idx);                                                  \
+                                                                                                 \
+  extern template auto raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
+    raft::device_resources const& handle,                                                        \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,                                             \
+    const T* new_vectors,                                                                        \
+    const IdxT* new_indices,                                                                     \
+    IdxT n_rows)                                                                                 \
+    ->raft::neighbors::ivf_pq::index<IdxT>;                                                      \
+                                                                                                 \
+  extern template void raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
+    raft::device_resources const& handle,                                                        \
+    raft::neighbors::ivf_pq::index<IdxT>* idx,                                                   \
+    const T* new_vectors,                                                                        \
+    const IdxT* new_indices,                                                                     \
+    IdxT n_rows)
+
+instantiate_raft_neighbors_ivf_pq_extend(float, int64_t);
+instantiate_raft_neighbors_ivf_pq_extend(int8_t, int64_t);
+instantiate_raft_neighbors_ivf_pq_extend(uint8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_pq_extend
+// Explicit instantiation declarations (`extern template`) of both ivf_pq::search overloads.
+#define instantiate_raft_neighbors_ivf_pq_search(T, IdxT)        \
+  extern template void raft::neighbors::ivf_pq::search<T, IdxT>( \
+    raft::device_resources const& handle,                        \
+    const raft::neighbors::ivf_pq::search_params& params,        \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,             \
+    raft::device_matrix_view<const T, IdxT, row_major> queries,  \
+    raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,   \
+    raft::device_matrix_view<float, IdxT, row_major> distances); \
+                                                                 \
+  extern template void raft::neighbors::ivf_pq::search<T, IdxT>( \
+    raft::device_resources const& handle,                        \
+    const raft::neighbors::ivf_pq::search_params& params,        \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,             \
+    const T* queries,                                            \
+    uint32_t n_queries,                                          \
+    uint32_t k,                                                  \
+    IdxT* neighbors,                                             \
+    float* distances,                                            \
+    rmm::mr::device_memory_resource* mr)
+
+instantiate_raft_neighbors_ivf_pq_search(float, int64_t);
+instantiate_raft_neighbors_ivf_pq_search(int8_t, int64_t);
+instantiate_raft_neighbors_ivf_pq_search(uint8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_pq_search
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index e69de29bb2..7b3dd05efd 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "ivf_pq-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "ivf_pq-ext.cuh"
+#endif
diff --git a/cpp/include/raft/neighbors/specializations.cuh b/cpp/include/raft/neighbors/specializations.cuh
index 9da5649ef8..a75aa22e57 100644
--- a/cpp/include/raft/neighbors/specializations.cuh
+++ b/cpp/include/raft/neighbors/specializations.cuh
@@ -21,7 +21,6 @@
 #include <raft/neighbors/specializations/fused_l2_knn.cuh>
 
 #include <raft/neighbors/specializations/ivf_flat.cuh>
-#include <raft/neighbors/specializations/ivf_pq.cuh>
 #include <raft/neighbors/specializations/refine.cuh>
 
 #include <raft/cluster/specializations.cuh>
diff --git a/cpp/include/raft/neighbors/specializations/detail/ivf_pq_compute_similarity.cuh b/cpp/include/raft/neighbors/specializations/detail/ivf_pq_compute_similarity.cuh
deleted file mode 100644
index f1c46b1225..0000000000
--- a/cpp/include/raft/neighbors/specializations/detail/ivf_pq_compute_similarity.cuh
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-namespace {
-using fp8s_t = fp_8bit<5, true>;
-using fp8u_t = fp_8bit<5, false>;
-}  // namespace
-
-#define RAFT_INST(OutT, LutT)                                                                     \
-  extern template auto get_compute_similarity_kernel<OutT, LutT, true, true>(uint32_t, uint32_t)  \
-    ->compute_similarity_kernel_t<OutT, LutT>;                                                    \
-  extern template auto get_compute_similarity_kernel<OutT, LutT, true, false>(uint32_t, uint32_t) \
-    ->compute_similarity_kernel_t<OutT, LutT>;                                                    \
-  extern template auto get_compute_similarity_kernel<OutT, LutT, false, true>(uint32_t, uint32_t) \
-    ->compute_similarity_kernel_t<OutT, LutT>;
-
-#define RAFT_INST_ALL_OUT_T(LutT) \
-  RAFT_INST(float, LutT)          \
-  RAFT_INST(half, LutT)
-
-RAFT_INST_ALL_OUT_T(float)
-RAFT_INST_ALL_OUT_T(half)
-RAFT_INST_ALL_OUT_T(fp8s_t)
-RAFT_INST_ALL_OUT_T(fp8u_t)
-
-#undef RAFT_INST
-#undef RAFT_INST_ALL_OUT_T
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/include/raft/neighbors/specializations/ivf_pq.cuh b/cpp/include/raft/neighbors/specializations/ivf_pq.cuh
deleted file mode 100644
index 55a7cd5858..0000000000
--- a/cpp/include/raft/neighbors/specializations/ivf_pq.cuh
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/cluster/specializations.cuh>
-#include <raft/distance/specializations.cuh>
-#include <raft/matrix/specializations.cuh>
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations/detail/ivf_pq_compute_similarity.cuh>
-
-namespace raft::neighbors::ivf_pq {
-
-#ifdef RAFT_DECL_BUILD_EXTEND
-#undef RAFT_DECL_BUILD_EXTEND
-#endif
-
-#ifdef RAFT_DECL_SEARCH
-#undef RAFT_DECL_SEARCH
-#endif
-
-// We define overloads for build and extend with void return type. This is used in the Cython
-// wrappers, where exception handling is not compatible with return type that has nontrivial
-// constructor.
-#define RAFT_DECL_BUILD_EXTEND(T, IdxT)                                          \
-  extern template auto build(raft::device_resources const&,                      \
-                             const raft::neighbors::ivf_pq::index_params&,       \
-                             raft::device_matrix_view<const T, IdxT, row_major>) \
-    ->raft::neighbors::ivf_pq::index<IdxT>;                                      \
-                                                                                 \
-  extern template auto extend(                                                   \
-    raft::device_resources const&,                                               \
-    raft::device_matrix_view<const T, IdxT, row_major>,                          \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>>,        \
-    const raft::neighbors::ivf_pq::index<IdxT>&)                                 \
-    ->raft::neighbors::ivf_pq::index<IdxT>;                                      \
-                                                                                 \
-  extern template void extend(                                                   \
-    raft::device_resources const&,                                               \
-    raft::device_matrix_view<const T, IdxT, row_major>,                          \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>>,        \
-    raft::neighbors::ivf_pq::index<IdxT>*);
-
-RAFT_DECL_BUILD_EXTEND(float, int64_t)
-RAFT_DECL_BUILD_EXTEND(int8_t, int64_t)
-RAFT_DECL_BUILD_EXTEND(uint8_t, int64_t)
-
-#undef RAFT_DECL_BUILD_EXTEND
-
-#define RAFT_DECL_SEARCH(T, IdxT)                                                 \
-  extern template void search(raft::device_resources const&,                      \
-                              const raft::neighbors::ivf_pq::search_params&,      \
-                              const raft::neighbors::ivf_pq::index<IdxT>&,        \
-                              raft::device_matrix_view<const T, IdxT, row_major>, \
-                              raft::device_matrix_view<IdxT, IdxT, row_major>,    \
-                              raft::device_matrix_view<float, IdxT, row_major>);
-
-RAFT_DECL_SEARCH(float, int64_t);
-RAFT_DECL_SEARCH(int8_t, int64_t);
-RAFT_DECL_SEARCH(uint8_t, int64_t);
-
-#undef RAFT_DECL_SEARCH
-
-}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/include/raft/util/detail/cub_wrappers.cuh b/cpp/include/raft/util/detail/cub_wrappers.cuh
index 8c70331165..94225a9175 100644
--- a/cpp/include/raft/util/detail/cub_wrappers.cuh
+++ b/cpp/include/raft/util/detail/cub_wrappers.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ void sortPairs(rmm::device_uvector<char>& workspace,
                int len,
                cudaStream_t stream)
 {
-  size_t worksize;
+  size_t worksize = 0;  // prevent warnings about using uninitialized value of worksize..
   cub::DeviceRadixSort::SortPairs(
     nullptr, worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream);
   workspace.resize(worksize, stream);
diff --git a/cpp/src/distance/specializations/detail/inner_product_float_float_float_int.cu b/cpp/include/raft/util/inline.hpp
similarity index 50%
rename from cpp/src/distance/specializations/detail/inner_product_float_float_float_int.cu
rename to cpp/include/raft/util/inline.hpp
index 2b06ca4dc2..1b625a8a72 100644
--- a/cpp/src/distance/specializations/detail/inner_product_float_float_float_int.cu
+++ b/cpp/include/raft/util/inline.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,24 +14,10 @@
  * limitations under the License.
  */
 
-#include <raft/distance/detail/distance.cuh>
-#include <raft/distance/specializations.cuh>
+#pragma once
 
-namespace raft {
-namespace distance {
-namespace detail {
-template void distance<raft::distance::DistanceType::InnerProduct, float, float, float, int>(
-  raft::resources const& handle,
-  const float* x,
-  const float* y,
-  float* dist,
-  int m,
-  int n,
-  int k,
-  void* workspace,
-  std::size_t worksize,
-  bool isRowMajor,
-  float metric_arg);
-}  // namespace detail
-}  // namespace distance
-}  // namespace raft
+#ifdef RAFT_COMPILED
+#define RAFT_INLINE_CONDITIONAL  // expands to nothing when RAFT_COMPILED is defined
+#else
+#define RAFT_INLINE_CONDITIONAL inline  // expands to `inline` when RAFT_COMPILED is not defined
+#endif  // RAFT_COMPILED
diff --git a/cpp/src/distance/fused_l2_nn.cu b/cpp/src/distance/fused_l2_nn.cu
new file mode 100644
index 0000000000..6011aaec29
--- /dev/null
+++ b/cpp/src/distance/fused_l2_nn.cu
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>            // int64_t
+#include <raft/core/kvp.hpp>  // raft::KeyValuePair
+#include <raft/distance/fused_l2_nn-inl.cuh>
+// Explicit instantiations of fusedL2NNMinReduce for the (DataT, OutT, IdxT) combinations below.
+#define instantiate_raft_distance_fusedL2NNMinReduce(DataT, OutT, IdxT)                   \
+  template void raft::distance::fusedL2NNMinReduce<DataT, OutT, IdxT>(OutT * min,         \
+                                                                      const DataT* x,     \
+                                                                      const DataT* y,     \
+                                                                      const DataT* xn,    \
+                                                                      const DataT* yn,    \
+                                                                      IdxT m,             \
+                                                                      IdxT n,             \
+                                                                      IdxT k,             \
+                                                                      void* workspace,    \
+                                                                      bool sqrt,          \
+                                                                      bool initOutBuffer, \
+                                                                      cudaStream_t stream)
+
+instantiate_raft_distance_fusedL2NNMinReduce(double, double, int);
+instantiate_raft_distance_fusedL2NNMinReduce(double, double, int64_t);
+instantiate_raft_distance_fusedL2NNMinReduce(float, float, int);
+instantiate_raft_distance_fusedL2NNMinReduce(float, float, int64_t);
+
+// A comma inside a macro argument would split it in two, so we spell it with the COMMA macro:
+#define COMMA ,
+
+instantiate_raft_distance_fusedL2NNMinReduce(double, raft::KeyValuePair<int COMMA double>, int);
+instantiate_raft_distance_fusedL2NNMinReduce(double,
+                                             raft::KeyValuePair<int64_t COMMA double>,
+                                             int64_t);
+instantiate_raft_distance_fusedL2NNMinReduce(float, raft::KeyValuePair<int COMMA float>, int);
+instantiate_raft_distance_fusedL2NNMinReduce(float,
+                                             raft::KeyValuePair<int64_t COMMA float>,
+                                             int64_t);
+
+#undef COMMA
+
+#undef instantiate_raft_distance_fusedL2NNMinReduce
diff --git a/cpp/src/distance/specializations/detail/00_write_template.py b/cpp/src/distance/specializations/detail/00_write_template.py
deleted file mode 100644
index 3f2f853569..0000000000
--- a/cpp/src/distance/specializations/detail/00_write_template.py
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/usr/bin/env python3
-
-# NOTE: this template is not perfectly formatted. Use pre-commit to get
-# everything in shape again.
-template = """/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp> // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh> // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh> // pairwise_matrix_instantiation_point
-INCLUDE_SM_HEADERS
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<OpT,
-                                                  IdxT,
-                                                  DataT,
-                                                  OutT,
-                                                  FinopT>(
-  OpT,
-  pairwise_matrix_params<IdxT, DataT, OutT, FinopT>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
-"""
-
-data_type_instances = [
-    dict(
-        DataT="float",
-        AccT="float",
-        OutT="float",
-        IdxT="int",
-    ),
-    dict(
-        DataT="double",
-        AccT="double",
-        OutT="double",
-        IdxT="int",
-    ),
-]
-
-op_instances = [
-    dict(
-        path_prefix="canberra",
-        OpT="ops::canberra_distance_op<DataT, AccT, IdxT>",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="correlation",
-        OpT="ops::correlation_distance_op<DataT, AccT, IdxT>",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="cosine",
-        OpT="ops::cosine_distance_op<DataT, AccT, IdxT>",
-        archs = [60, 80],
-    ),
-    dict(
-        path_prefix="hamming_unexpanded",
-        OpT="ops::hamming_distance_op<DataT, AccT, IdxT>",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="hellinger_expanded",
-        OpT="ops::hellinger_distance_op<DataT, AccT, IdxT>",
-        archs = [60],
-    ),
-    # inner product is handled by cublas.
-    dict(
-        path_prefix="jensen_shannon",
-        OpT="ops::jensen_shannon_distance_op<DataT, AccT, IdxT>",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="kl_divergence",
-        OpT="ops::kl_divergence_op<DataT, AccT, IdxT>",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="l1",
-        OpT="ops::l1_distance_op<DataT, AccT, IdxT>",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="l2_expanded",
-        OpT="ops::l2_exp_distance_op<DataT, AccT, IdxT>",
-        archs = [60, 80],
-    ),
-    dict(
-        path_prefix="l2_unexpanded",
-        OpT="ops::l2_unexp_distance_op<DataT, AccT, IdxT>",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="l_inf",
-        OpT="ops::l_inf_distance_op<DataT, AccT, IdxT>",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="lp_unexpanded",
-        OpT="ops::lp_unexp_distance_op<DataT, AccT, IdxT>",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="russel_rao",
-        OpT="ops::russel_rao_distance_op<DataT, AccT, IdxT>",
-        archs = [60],
-     ),
-]
-
-def fill_in(s, template):
-    for k, v in template.items():
-        s = s.replace(k, v)
-    return s
-
-def fill_include_sm_headers(op_instance):
-    include_headers ="\n".join([
-        f"#include <raft/distance/detail/pairwise_matrix/dispatch_sm{arch}.cuh>"
-        for arch in op_instance["archs"]
-    ])
-
-    return {
-        "path_prefix": op_instance["path_prefix"],
-        "OpT": op_instance["OpT"],
-        "INCLUDE_SM_HEADERS": include_headers
-    }
-
-for op_instance in op_instances:
-    op_instance = fill_include_sm_headers(op_instance)
-
-    for data_type_instance in data_type_instances:
-        op_data_instance = {
-            k : fill_in(v, data_type_instance)
-            for k, v in op_instance.items()
-        }
-        instance = {
-            **op_data_instance,
-            **data_type_instance,
-            "FinopT": "decltype(raft::identity_op())",
-        }
-
-        text = fill_in(template, instance)
-
-        path = fill_in("path_prefix_DataT_AccT_OutT_IdxT.cu", instance)
-        with open(path, "w") as f:
-            f.write(text)
diff --git a/cpp/src/distance/specializations/detail/canberra_double_double_double_int.cu b/cpp/src/distance/specializations/detail/canberra_double_double_double_int.cu
deleted file mode 100644
index 037d218178..0000000000
--- a/cpp/src/distance/specializations/detail/canberra_double_double_double_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::canberra_distance_op<double, double, int>,
-                                                  int,
-                                                  double,
-                                                  double,
-                                                  decltype(raft::identity_op())>(
-  ops::canberra_distance_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/canberra_float_float_float_int.cu b/cpp/src/distance/specializations/detail/canberra_float_float_float_int.cu
deleted file mode 100644
index 0ed8ea7bb0..0000000000
--- a/cpp/src/distance/specializations/detail/canberra_float_float_float_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::canberra_distance_op<float, float, int>,
-                                                  int,
-                                                  float,
-                                                  float,
-                                                  decltype(raft::identity_op())>(
-  ops::canberra_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/correlation_double_double_double_int.cu b/cpp/src/distance/specializations/detail/correlation_double_double_double_int.cu
deleted file mode 100644
index 0c11f0621e..0000000000
--- a/cpp/src/distance/specializations/detail/correlation_double_double_double_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::correlation_distance_op<double, double, int>,
-                                                  int,
-                                                  double,
-                                                  double,
-                                                  decltype(raft::identity_op())>(
-  ops::correlation_distance_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/correlation_float_float_float_int.cu b/cpp/src/distance/specializations/detail/correlation_float_float_float_int.cu
deleted file mode 100644
index 396e158554..0000000000
--- a/cpp/src/distance/specializations/detail/correlation_float_float_float_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::correlation_distance_op<float, float, int>,
-                                                  int,
-                                                  float,
-                                                  float,
-                                                  decltype(raft::identity_op())>(
-  ops::correlation_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/cosine_double_double_double_int.cu b/cpp/src/distance/specializations/detail/cosine_double_double_double_int.cu
deleted file mode 100644
index e9afb6f563..0000000000
--- a/cpp/src/distance/specializations/detail/cosine_double_double_double_int.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm80.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::cosine_distance_op<double, double, int>,
-                                                  int,
-                                                  double,
-                                                  double,
-                                                  decltype(raft::identity_op())>(
-  ops::cosine_distance_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/cosine_float_float_float_int.cu b/cpp/src/distance/specializations/detail/cosine_float_float_float_int.cu
deleted file mode 100644
index 1033c491d6..0000000000
--- a/cpp/src/distance/specializations/detail/cosine_float_float_float_int.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm80.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::cosine_distance_op<float, float, int>,
-                                                  int,
-                                                  float,
-                                                  float,
-                                                  decltype(raft::identity_op())>(
-  ops::cosine_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu b/cpp/src/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu
deleted file mode 100644
index 195115914d..0000000000
--- a/cpp/src/distance/specializations/detail/hamming_unexpanded_double_double_double_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::hamming_distance_op<double, double, int>,
-                                                  int,
-                                                  double,
-                                                  double,
-                                                  decltype(raft::identity_op())>(
-  ops::hamming_distance_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu b/cpp/src/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu
deleted file mode 100644
index a74c6c404e..0000000000
--- a/cpp/src/distance/specializations/detail/hamming_unexpanded_float_float_float_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::hamming_distance_op<float, float, int>,
-                                                  int,
-                                                  float,
-                                                  float,
-                                                  decltype(raft::identity_op())>(
-  ops::hamming_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu b/cpp/src/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu
deleted file mode 100644
index bac1dd7bd0..0000000000
--- a/cpp/src/distance/specializations/detail/hellinger_expanded_double_double_double_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::hellinger_distance_op<double, double, int>,
-                                                  int,
-                                                  double,
-                                                  double,
-                                                  decltype(raft::identity_op())>(
-  ops::hellinger_distance_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu b/cpp/src/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu
deleted file mode 100644
index 77c113b1a9..0000000000
--- a/cpp/src/distance/specializations/detail/hellinger_expanded_float_float_float_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::hellinger_distance_op<float, float, int>,
-                                                  int,
-                                                  float,
-                                                  float,
-                                                  decltype(raft::identity_op())>(
-  ops::hellinger_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/inner_product_double_double_double_int.cu b/cpp/src/distance/specializations/detail/inner_product_double_double_double_int.cu
deleted file mode 100644
index 3db0a3572e..0000000000
--- a/cpp/src/distance/specializations/detail/inner_product_double_double_double_int.cu
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/distance/detail/distance.cuh>
-#include <raft/distance/specializations.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-template void distance<raft::distance::DistanceType::InnerProduct, double, double, double, int>(
-  raft::resources const& handle,
-  const double* x,
-  const double* y,
-  double* dist,
-  int m,
-  int n,
-  int k,
-  void* workspace,
-  std::size_t worksize,
-  bool isRowMajor,
-  double metric_arg);
-
-}  // namespace detail
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu b/cpp/src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu
deleted file mode 100644
index 188e52c152..0000000000
--- a/cpp/src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void
-  pairwise_matrix_instantiation_point<ops::jensen_shannon_distance_op<double, double, int>,
-                                      int,
-                                      double,
-                                      double,
-                                      decltype(raft::identity_op())>(
-    ops::jensen_shannon_distance_op<double, double, int>,
-    pairwise_matrix_params<int, double, double, decltype(raft::identity_op())>,
-    cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu b/cpp/src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu
deleted file mode 100644
index b0afbf7bb2..0000000000
--- a/cpp/src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void
-  pairwise_matrix_instantiation_point<ops::jensen_shannon_distance_op<float, float, int>,
-                                      int,
-                                      float,
-                                      float,
-                                      decltype(raft::identity_op())>(
-    ops::jensen_shannon_distance_op<float, float, int>,
-    pairwise_matrix_params<int, float, float, decltype(raft::identity_op())>,
-    cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/kl_divergence_double_double_double_int.cu b/cpp/src/distance/specializations/detail/kl_divergence_double_double_double_int.cu
deleted file mode 100644
index f06ae85414..0000000000
--- a/cpp/src/distance/specializations/detail/kl_divergence_double_double_double_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::kl_divergence_op<double, double, int>,
-                                                  int,
-                                                  double,
-                                                  double,
-                                                  decltype(raft::identity_op())>(
-  ops::kl_divergence_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/kl_divergence_float_float_float_int.cu b/cpp/src/distance/specializations/detail/kl_divergence_float_float_float_int.cu
deleted file mode 100644
index 00d5a5ee5b..0000000000
--- a/cpp/src/distance/specializations/detail/kl_divergence_float_float_float_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::kl_divergence_op<float, float, int>,
-                                                  int,
-                                                  float,
-                                                  float,
-                                                  decltype(raft::identity_op())>(
-  ops::kl_divergence_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/l1_double_double_double_int.cu b/cpp/src/distance/specializations/detail/l1_double_double_double_int.cu
deleted file mode 100644
index 5c235316da..0000000000
--- a/cpp/src/distance/specializations/detail/l1_double_double_double_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::l1_distance_op<double, double, int>,
-                                                  int,
-                                                  double,
-                                                  double,
-                                                  decltype(raft::identity_op())>(
-  ops::l1_distance_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/l1_float_float_float_int.cu b/cpp/src/distance/specializations/detail/l1_float_float_float_int.cu
deleted file mode 100644
index fb293ca83d..0000000000
--- a/cpp/src/distance/specializations/detail/l1_float_float_float_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::l1_distance_op<float, float, int>,
-                                                  int,
-                                                  float,
-                                                  float,
-                                                  decltype(raft::identity_op())>(
-  ops::l1_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/l2_expanded_double_double_double_int.cu b/cpp/src/distance/specializations/detail/l2_expanded_double_double_double_int.cu
deleted file mode 100644
index 2c02f0224f..0000000000
--- a/cpp/src/distance/specializations/detail/l2_expanded_double_double_double_int.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm80.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::l2_exp_distance_op<double, double, int>,
-                                                  int,
-                                                  double,
-                                                  double,
-                                                  decltype(raft::identity_op())>(
-  ops::l2_exp_distance_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/l2_expanded_float_float_float_int.cu b/cpp/src/distance/specializations/detail/l2_expanded_float_float_float_int.cu
deleted file mode 100644
index 85e25a25ca..0000000000
--- a/cpp/src/distance/specializations/detail/l2_expanded_float_float_float_int.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm80.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::l2_exp_distance_op<float, float, int>,
-                                                  int,
-                                                  float,
-                                                  float,
-                                                  decltype(raft::identity_op())>(
-  ops::l2_exp_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu b/cpp/src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu
deleted file mode 100644
index 5b4d995d14..0000000000
--- a/cpp/src/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::l2_unexp_distance_op<double, double, int>,
-                                                  int,
-                                                  double,
-                                                  double,
-                                                  decltype(raft::identity_op())>(
-  ops::l2_unexp_distance_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu b/cpp/src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu
deleted file mode 100644
index a63c3f0bb8..0000000000
--- a/cpp/src/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::l2_unexp_distance_op<float, float, int>,
-                                                  int,
-                                                  float,
-                                                  float,
-                                                  decltype(raft::identity_op())>(
-  ops::l2_unexp_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/l_inf_double_double_double_int.cu b/cpp/src/distance/specializations/detail/l_inf_double_double_double_int.cu
deleted file mode 100644
index 831167523f..0000000000
--- a/cpp/src/distance/specializations/detail/l_inf_double_double_double_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::l_inf_distance_op<double, double, int>,
-                                                  int,
-                                                  double,
-                                                  double,
-                                                  decltype(raft::identity_op())>(
-  ops::l_inf_distance_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/l_inf_float_float_float_int.cu b/cpp/src/distance/specializations/detail/l_inf_float_float_float_int.cu
deleted file mode 100644
index 02e667cbe3..0000000000
--- a/cpp/src/distance/specializations/detail/l_inf_float_float_float_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::l_inf_distance_op<float, float, int>,
-                                                  int,
-                                                  float,
-                                                  float,
-                                                  decltype(raft::identity_op())>(
-  ops::l_inf_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu b/cpp/src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu
deleted file mode 100644
index ebd71065ec..0000000000
--- a/cpp/src/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::lp_unexp_distance_op<double, double, int>,
-                                                  int,
-                                                  double,
-                                                  double,
-                                                  decltype(raft::identity_op())>(
-  ops::lp_unexp_distance_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu b/cpp/src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu
deleted file mode 100644
index b94a81fdce..0000000000
--- a/cpp/src/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::lp_unexp_distance_op<float, float, int>,
-                                                  int,
-                                                  float,
-                                                  float,
-                                                  decltype(raft::identity_op())>(
-  ops::lp_unexp_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/russel_rao_double_double_double_int.cu b/cpp/src/distance/specializations/detail/russel_rao_double_double_double_int.cu
deleted file mode 100644
index 6f952fcc37..0000000000
--- a/cpp/src/distance/specializations/detail/russel_rao_double_double_double_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::russel_rao_distance_op<double, double, int>,
-                                                  int,
-                                                  double,
-                                                  double,
-                                                  decltype(raft::identity_op())>(
-  ops::russel_rao_distance_op<double, double, int>,
-  pairwise_matrix_params<int, double, double, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/detail/russel_rao_float_float_float_int.cu b/cpp/src/distance/specializations/detail/russel_rao_float_float_float_int.cu
deleted file mode 100644
index 3223ce33a7..0000000000
--- a/cpp/src/distance/specializations/detail/russel_rao_float_float_float_int.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>                            // raft::identity_op
-#include <raft/distance/detail/distance_ops/all_ops.cuh>      // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>  // pairwise_matrix_instantiation_point
-#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
-
-namespace raft::distance::detail {
-
-template void pairwise_matrix_instantiation_point<ops::russel_rao_distance_op<float, float, int>,
-                                                  int,
-                                                  float,
-                                                  float,
-                                                  decltype(raft::identity_op())>(
-  ops::russel_rao_distance_op<float, float, int>,
-  pairwise_matrix_params<int, float, float, decltype(raft::identity_op())>,
-  cudaStream_t);
-
-}  // namespace raft::distance::detail
diff --git a/cpp/src/distance/specializations/fused_l2_nn_double_int.cu b/cpp/src/distance/specializations/fused_l2_nn_double_int.cu
deleted file mode 100644
index b49132b042..0000000000
--- a/cpp/src/distance/specializations/fused_l2_nn_double_int.cu
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/kvp.hpp>
-#include <raft/distance/fused_l2_nn.cuh>
-#include <raft/distance/specializations.cuh>
-
-namespace raft {
-namespace distance {
-
-template void fusedL2NNMinReduce<double, raft::KeyValuePair<int, double>, int>(
-  raft::KeyValuePair<int, double>* min,
-  const double* x,
-  const double* y,
-  const double* xn,
-  const double* yn,
-  int m,
-  int n,
-  int k,
-  void* workspace,
-  bool sqrt,
-  bool initOutBuffer,
-  cudaStream_t stream);
-template void fusedL2NNMinReduce<double, double, int>(double* min,
-                                                      const double* x,
-                                                      const double* y,
-                                                      const double* xn,
-                                                      const double* yn,
-                                                      int m,
-                                                      int n,
-                                                      int k,
-                                                      void* workspace,
-                                                      bool sqrt,
-                                                      bool initOutBuffer,
-                                                      cudaStream_t stream);
-
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/src/distance/specializations/fused_l2_nn_double_int64.cu b/cpp/src/distance/specializations/fused_l2_nn_double_int64.cu
deleted file mode 100644
index b1e3a900a9..0000000000
--- a/cpp/src/distance/specializations/fused_l2_nn_double_int64.cu
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/kvp.hpp>
-#include <raft/distance/fused_l2_nn.cuh>
-#include <raft/distance/specializations.cuh>
-
-namespace raft {
-namespace distance {
-
-template void fusedL2NNMinReduce<double, raft::KeyValuePair<int64_t, double>, int64_t>(
-  raft::KeyValuePair<int64_t, double>* min,
-  const double* x,
-  const double* y,
-  const double* xn,
-  const double* yn,
-  int64_t m,
-  int64_t n,
-  int64_t k,
-  void* workspace,
-  bool sqrt,
-  bool initOutBuffer,
-  cudaStream_t stream);
-template void fusedL2NNMinReduce<double, double, int64_t>(double* min,
-                                                          const double* x,
-                                                          const double* y,
-                                                          const double* xn,
-                                                          const double* yn,
-                                                          int64_t m,
-                                                          int64_t n,
-                                                          int64_t k,
-                                                          void* workspace,
-                                                          bool sqrt,
-                                                          bool initOutBuffer,
-                                                          cudaStream_t stream);
-
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/src/distance/specializations/fused_l2_nn_float_int.cu b/cpp/src/distance/specializations/fused_l2_nn_float_int.cu
deleted file mode 100644
index 44b4953d8c..0000000000
--- a/cpp/src/distance/specializations/fused_l2_nn_float_int.cu
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/kvp.hpp>
-#include <raft/distance/fused_l2_nn.cuh>
-#include <raft/distance/specializations.cuh>
-
-namespace raft {
-namespace distance {
-
-template void fusedL2NNMinReduce<float, raft::KeyValuePair<int, float>, int>(
-  raft::KeyValuePair<int, float>* min,
-  const float* x,
-  const float* y,
-  const float* xn,
-  const float* yn,
-  int m,
-  int n,
-  int k,
-  void* workspace,
-  bool sqrt,
-  bool initOutBuffer,
-  cudaStream_t stream);
-template void fusedL2NNMinReduce<float, float, int>(float* min,
-                                                    const float* x,
-                                                    const float* y,
-                                                    const float* xn,
-                                                    const float* yn,
-                                                    int m,
-                                                    int n,
-                                                    int k,
-                                                    void* workspace,
-                                                    bool sqrt,
-                                                    bool initOutBuffer,
-                                                    cudaStream_t stream);
-
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/src/distance/specializations/fused_l2_nn_float_int64.cu b/cpp/src/distance/specializations/fused_l2_nn_float_int64.cu
deleted file mode 100644
index 9ca2b639a9..0000000000
--- a/cpp/src/distance/specializations/fused_l2_nn_float_int64.cu
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/kvp.hpp>
-#include <raft/distance/fused_l2_nn.cuh>
-#include <raft/distance/specializations.cuh>
-
-namespace raft {
-namespace distance {
-
-template void fusedL2NNMinReduce<float, raft::KeyValuePair<int64_t, float>, int64_t>(
-  raft::KeyValuePair<int64_t, float>* min,
-  const float* x,
-  const float* y,
-  const float* xn,
-  const float* yn,
-  int64_t m,
-  int64_t n,
-  int64_t k,
-  void* workspace,
-  bool sqrt,
-  bool initOutBuffer,
-  cudaStream_t stream);
-template void fusedL2NNMinReduce<float, float, int64_t>(float* min,
-                                                        const float* x,
-                                                        const float* y,
-                                                        const float* xn,
-                                                        const float* yn,
-                                                        int64_t m,
-                                                        int64_t n,
-                                                        int64_t k,
-                                                        void* workspace,
-                                                        bool sqrt,
-                                                        bool initOutBuffer,
-                                                        cudaStream_t stream);
-
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py
new file mode 100644
index 0000000000..48f0021595
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+
+header = """
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
+
+#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT) \\
+    template auto raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT>( \\
+        const cudaDeviceProp& dev_props,                                \\
+        bool manage_local_topk,                                         \\
+        int locality_hint,                                              \\
+        double preferred_shmem_carveout,                                \\
+        uint32_t pq_bits,                                               \\
+        uint32_t pq_dim,                                                \\
+        uint32_t precomp_data_count,                                    \\
+        uint32_t n_queries,                                             \\
+        uint32_t n_probes,                                              \\
+        uint32_t topk) -> raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+
+#define COMMA ,
+"""
+
+trailer = """
+#undef COMMA
+
+#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
+"""
+
+types = dict(
+    half_fp8_false=("half", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>"),
+    half_fp8_true=("half", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>"),
+    half_half=("half", "half"),
+    float_half=("float", "half"),
+    float_float= ("float", "float"),
+    float_fp8_false=("float", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>"),
+    float_fp8_true=("float", "raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>"),
+)
+
+for path_key, (OutT, LutT) in types.items():
+    path = f"ivf_pq_compute_similarity_{path_key}.cu"
+    with open(path, "w") as f:
+        f.write(header)
+        f.write(f"instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select({OutT}, {LutT});\n")
+        f.write(trailer)
+    print(f"src/neighbors/detail/{path}")
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
new file mode 100644
index 0000000000..4adc0d2029
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
@@ -0,0 +1,40 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
+// Emits an explicit instantiation of compute_similarity_select<OutT, LutT>.
+#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT)  \
+  template auto raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT>( \
+    const cudaDeviceProp& dev_props,                                                    \
+    bool manage_local_topk,                                                             \
+    int locality_hint,                                                                  \
+    double preferred_shmem_carveout,                                                    \
+    uint32_t pq_bits,                                                                   \
+    uint32_t pq_dim,                                                                    \
+    uint32_t precomp_data_count,                                                        \
+    uint32_t n_queries,                                                                 \
+    uint32_t n_probes,                                                                  \
+    uint32_t topk)                                                                      \
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+
+#define COMMA ,  // stands in for "," inside one macro argument; unused for this type pair
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(float, float);
+
+#undef COMMA
+
+#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
new file mode 100644
index 0000000000..2cdcb2f47e
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
@@ -0,0 +1,41 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
+// Emits an explicit instantiation of compute_similarity_select<OutT, LutT>.
+#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT)  \
+  template auto raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT>( \
+    const cudaDeviceProp& dev_props,                                                    \
+    bool manage_local_topk,                                                             \
+    int locality_hint,                                                                  \
+    double preferred_shmem_carveout,                                                    \
+    uint32_t pq_bits,                                                                   \
+    uint32_t pq_dim,                                                                    \
+    uint32_t precomp_data_count,                                                        \
+    uint32_t n_queries,                                                                 \
+    uint32_t n_probes,                                                                  \
+    uint32_t topk)                                                                      \
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+
+#define COMMA ,  // keeps the "," of fp_8bit<5u, ...> from splitting the macro argument
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>);
+
+#undef COMMA
+
+#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
new file mode 100644
index 0000000000..db0226a83b
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
@@ -0,0 +1,41 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
+// Emits an explicit instantiation of compute_similarity_select<OutT, LutT>.
+#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT)  \
+  template auto raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT>( \
+    const cudaDeviceProp& dev_props,                                                    \
+    bool manage_local_topk,                                                             \
+    int locality_hint,                                                                  \
+    double preferred_shmem_carveout,                                                    \
+    uint32_t pq_bits,                                                                   \
+    uint32_t pq_dim,                                                                    \
+    uint32_t precomp_data_count,                                                        \
+    uint32_t n_queries,                                                                 \
+    uint32_t n_probes,                                                                  \
+    uint32_t topk)                                                                      \
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+
+#define COMMA ,  // keeps the "," of fp_8bit<5u, ...> from splitting the macro argument
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  float, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>);
+
+#undef COMMA
+
+#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
new file mode 100644
index 0000000000..4d8efbc6a4
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
@@ -0,0 +1,40 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
+// Emits an explicit instantiation of compute_similarity_select<OutT, LutT>.
+#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT)  \
+  template auto raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT>( \
+    const cudaDeviceProp& dev_props,                                                    \
+    bool manage_local_topk,                                                             \
+    int locality_hint,                                                                  \
+    double preferred_shmem_carveout,                                                    \
+    uint32_t pq_bits,                                                                   \
+    uint32_t pq_dim,                                                                    \
+    uint32_t precomp_data_count,                                                        \
+    uint32_t n_queries,                                                                 \
+    uint32_t n_probes,                                                                  \
+    uint32_t topk)                                                                      \
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+
+#define COMMA ,  // stands in for "," inside one macro argument; unused for this type pair
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(float, half);
+
+#undef COMMA
+
+#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
new file mode 100644
index 0000000000..ec02753375
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
@@ -0,0 +1,41 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
+// Emits an explicit instantiation of compute_similarity_select<OutT, LutT>.
+#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT)  \
+  template auto raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT>( \
+    const cudaDeviceProp& dev_props,                                                    \
+    bool manage_local_topk,                                                             \
+    int locality_hint,                                                                  \
+    double preferred_shmem_carveout,                                                    \
+    uint32_t pq_bits,                                                                   \
+    uint32_t pq_dim,                                                                    \
+    uint32_t precomp_data_count,                                                        \
+    uint32_t n_queries,                                                                 \
+    uint32_t n_probes,                                                                  \
+    uint32_t topk)                                                                      \
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+
+#define COMMA ,  // keeps the "," of fp_8bit<5u, ...> from splitting the macro argument
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA false>);
+
+#undef COMMA
+
+#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
new file mode 100644
index 0000000000..20573a3570
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
@@ -0,0 +1,41 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
+// Emits an explicit instantiation of compute_similarity_select<OutT, LutT>.
+#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT)  \
+  template auto raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT>( \
+    const cudaDeviceProp& dev_props,                                                    \
+    bool manage_local_topk,                                                             \
+    int locality_hint,                                                                  \
+    double preferred_shmem_carveout,                                                    \
+    uint32_t pq_bits,                                                                   \
+    uint32_t pq_dim,                                                                    \
+    uint32_t precomp_data_count,                                                        \
+    uint32_t n_queries,                                                                 \
+    uint32_t n_probes,                                                                  \
+    uint32_t topk)                                                                      \
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+
+#define COMMA ,  // keeps the "," of fp_8bit<5u, ...> from splitting the macro argument
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
+  half, raft::neighbors::ivf_pq::detail::fp_8bit<5u COMMA true>);
+
+#undef COMMA
+
+#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
new file mode 100644
index 0000000000..eb0217fe4e
--- /dev/null
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
@@ -0,0 +1,40 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
+// Emits an explicit instantiation of compute_similarity_select<OutT, LutT>.
+#define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT)  \
+  template auto raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT>( \
+    const cudaDeviceProp& dev_props,                                                    \
+    bool manage_local_topk,                                                             \
+    int locality_hint,                                                                  \
+    double preferred_shmem_carveout,                                                    \
+    uint32_t pq_bits,                                                                   \
+    uint32_t pq_dim,                                                                    \
+    uint32_t precomp_data_count,                                                        \
+    uint32_t n_queries,                                                                 \
+    uint32_t n_probes,                                                                  \
+    uint32_t topk)                                                                      \
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+
+#define COMMA ,  // stands in for "," inside one macro argument; unused for this type pair
+instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(half, half);
+
+#undef COMMA
+
+#undef instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select
diff --git a/cpp/src/neighbors/ivfpq_build_float_int64_t.cu b/cpp/src/neighbors/ivfpq_build_float_int64_t.cu
new file mode 100644
index 0000000000..6771964cae
--- /dev/null
+++ b/cpp/src/neighbors/ivfpq_build_float_int64_t.cu
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq-inl.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
+// Explicitly instantiates the device_matrix_view and raw-pointer overloads of ivf_pq::build.
+#define instantiate_raft_neighbors_ivf_pq_build(T, IdxT)                                 \
+  template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::build<T, IdxT>( \
+    raft::device_resources const& handle,                                                \
+    const raft::neighbors::ivf_pq::index_params& params,                                 \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset);                         \
+                                                                                         \
+  template auto raft::neighbors::ivf_pq::build(                                          \
+    raft::device_resources const& handle,                                                \
+    const raft::neighbors::ivf_pq::index_params& params,                                 \
+    const T* dataset,                                                                    \
+    IdxT n_rows,                                                                         \
+    uint32_t dim)                                                                        \
+    ->raft::neighbors::ivf_pq::index<IdxT>;
+
+instantiate_raft_neighbors_ivf_pq_build(float, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_pq_build
diff --git a/cpp/src/neighbors/ivfpq_build_int8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_build_int8_t_int64_t.cu
new file mode 100644
index 0000000000..759045faa7
--- /dev/null
+++ b/cpp/src/neighbors/ivfpq_build_int8_t_int64_t.cu
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq-inl.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
+// Explicitly instantiates the device_matrix_view and raw-pointer overloads of ivf_pq::build.
+#define instantiate_raft_neighbors_ivf_pq_build(T, IdxT)                                 \
+  template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::build<T, IdxT>( \
+    raft::device_resources const& handle,                                                \
+    const raft::neighbors::ivf_pq::index_params& params,                                 \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset);                         \
+                                                                                         \
+  template auto raft::neighbors::ivf_pq::build(                                          \
+    raft::device_resources const& handle,                                                \
+    const raft::neighbors::ivf_pq::index_params& params,                                 \
+    const T* dataset,                                                                    \
+    IdxT n_rows,                                                                         \
+    uint32_t dim)                                                                        \
+    ->raft::neighbors::ivf_pq::index<IdxT>;
+
+instantiate_raft_neighbors_ivf_pq_build(int8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_pq_build
diff --git a/cpp/src/neighbors/ivfpq_build_uint8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_build_uint8_t_int64_t.cu
new file mode 100644
index 0000000000..62a47e9bcf
--- /dev/null
+++ b/cpp/src/neighbors/ivfpq_build_uint8_t_int64_t.cu
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq-inl.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
+// Explicitly instantiates the device_matrix_view and raw-pointer overloads of ivf_pq::build.
+#define instantiate_raft_neighbors_ivf_pq_build(T, IdxT)                                 \
+  template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::build<T, IdxT>( \
+    raft::device_resources const& handle,                                                \
+    const raft::neighbors::ivf_pq::index_params& params,                                 \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset);                         \
+                                                                                         \
+  template auto raft::neighbors::ivf_pq::build(                                          \
+    raft::device_resources const& handle,                                                \
+    const raft::neighbors::ivf_pq::index_params& params,                                 \
+    const T* dataset,                                                                    \
+    IdxT n_rows,                                                                         \
+    uint32_t dim)                                                                        \
+    ->raft::neighbors::ivf_pq::index<IdxT>;
+
+instantiate_raft_neighbors_ivf_pq_build(uint8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_pq_build
diff --git a/cpp/src/neighbors/ivfpq_deserialize.cu b/cpp/src/neighbors/ivfpq_deserialize.cu
index 8d54e3cc55..bb6ac13966 100644
--- a/cpp/src/neighbors/ivfpq_deserialize.cu
+++ b/cpp/src/neighbors/ivfpq_deserialize.cu
@@ -14,9 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
-
+#include <raft/neighbors/ivf_pq_serialize.cuh>
 #include <raft_runtime/neighbors/ivf_pq.hpp>
 
 namespace raft::runtime::neighbors::ivf_pq {
diff --git a/cpp/src/neighbors/ivfpq_extend_float_int64_t.cu b/cpp/src/neighbors/ivfpq_extend_float_int64_t.cu
new file mode 100644
index 0000000000..247fe7803f
--- /dev/null
+++ b/cpp/src/neighbors/ivfpq_extend_float_int64_t.cu
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq-inl.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
+// Instantiates the four ivf_pq::extend overloads (view/raw-pointer input, index/void return).
+#define instantiate_raft_neighbors_ivf_pq_extend(T, IdxT)                                 \
+  template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::extend<T, IdxT>( \
+    raft::device_resources const& handle,                                                 \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                       \
+    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,     \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx);                                     \
+                                                                                          \
+  template void raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
+    raft::device_resources const& handle,                                                 \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                       \
+    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,     \
+    raft::neighbors::ivf_pq::index<IdxT>* idx);                                           \
+                                                                                          \
+  template auto raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
+    raft::device_resources const& handle,                                                 \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,                                      \
+    const T* new_vectors,                                                                 \
+    const IdxT* new_indices,                                                              \
+    IdxT n_rows)                                                                          \
+    ->raft::neighbors::ivf_pq::index<IdxT>;                                               \
+                                                                                          \
+  template void raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
+    raft::device_resources const& handle,                                                 \
+    raft::neighbors::ivf_pq::index<IdxT>* idx,                                            \
+    const T* new_vectors,                                                                 \
+    const IdxT* new_indices,                                                              \
+    IdxT n_rows);
+
+instantiate_raft_neighbors_ivf_pq_extend(float, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_pq_extend
diff --git a/cpp/src/neighbors/ivfpq_extend_int8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_extend_int8_t_int64_t.cu
new file mode 100644
index 0000000000..2961dd0353
--- /dev/null
+++ b/cpp/src/neighbors/ivfpq_extend_int8_t_int64_t.cu
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq-inl.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
+
+// Explicit instantiations of the four raft::neighbors::ivf_pq::extend overloads for (T, IdxT):
+// device_matrix_view or raw-pointer inputs, each either returning an index or taking an index*.
+// The expansion has no trailing ';': the use site supplies it.
+#define instantiate_raft_neighbors_ivf_pq_extend(T, IdxT)                                 \
+  template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::extend<T, IdxT>( \
+    raft::device_resources const& handle,                                                 \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                       \
+    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,     \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx);                                     \
+                                                                                          \
+  template void raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
+    raft::device_resources const& handle,                                                 \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                       \
+    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,     \
+    raft::neighbors::ivf_pq::index<IdxT>* idx);                                           \
+                                                                                          \
+  template auto raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
+    raft::device_resources const& handle,                                                 \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,                                      \
+    const T* new_vectors,                                                                 \
+    const IdxT* new_indices,                                                              \
+    IdxT n_rows)                                                                          \
+    ->raft::neighbors::ivf_pq::index<IdxT>;                                               \
+                                                                                          \
+  template void raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
+    raft::device_resources const& handle,                                                 \
+    raft::neighbors::ivf_pq::index<IdxT>* idx,                                            \
+    const T* new_vectors,                                                                 \
+    const IdxT* new_indices,                                                              \
+    IdxT n_rows)
+
+instantiate_raft_neighbors_ivf_pq_extend(int8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_pq_extend
diff --git a/cpp/src/neighbors/ivfpq_extend_uint8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_extend_uint8_t_int64_t.cu
new file mode 100644
index 0000000000..9827486fcf
--- /dev/null
+++ b/cpp/src/neighbors/ivfpq_extend_uint8_t_int64_t.cu
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq-inl.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
+
+// Explicit instantiations of the four raft::neighbors::ivf_pq::extend overloads for (T, IdxT):
+// device_matrix_view or raw-pointer inputs, each either returning an index or taking an index*.
+// The expansion has no trailing ';': the use site supplies it.
+#define instantiate_raft_neighbors_ivf_pq_extend(T, IdxT)                                 \
+  template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::extend<T, IdxT>( \
+    raft::device_resources const& handle,                                                 \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                       \
+    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,     \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx);                                     \
+                                                                                          \
+  template void raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
+    raft::device_resources const& handle,                                                 \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                       \
+    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,     \
+    raft::neighbors::ivf_pq::index<IdxT>* idx);                                           \
+                                                                                          \
+  template auto raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
+    raft::device_resources const& handle,                                                 \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,                                      \
+    const T* new_vectors,                                                                 \
+    const IdxT* new_indices,                                                              \
+    IdxT n_rows)                                                                          \
+    ->raft::neighbors::ivf_pq::index<IdxT>;                                               \
+                                                                                          \
+  template void raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
+    raft::device_resources const& handle,                                                 \
+    raft::neighbors::ivf_pq::index<IdxT>* idx,                                            \
+    const T* new_vectors,                                                                 \
+    const IdxT* new_indices,                                                              \
+    IdxT n_rows)
+
+instantiate_raft_neighbors_ivf_pq_extend(uint8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_pq_extend
diff --git a/cpp/src/neighbors/ivfpq_search_float_int64_t.cu b/cpp/src/neighbors/ivfpq_search_float_int64_t.cu
index 91093d3a39..ab946d2b65 100644
--- a/cpp/src/neighbors/ivfpq_search_float_int64_t.cu
+++ b/cpp/src/neighbors/ivfpq_search_float_int64_t.cu
@@ -14,26 +14,29 @@
  * limitations under the License.
  */
 
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
+#include <raft/neighbors/ivf_pq-inl.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
 
-#include <raft_runtime/neighbors/ivf_pq.hpp>
+#define instantiate_raft_neighbors_ivf_pq_search(T, IdxT)        \
+  template void raft::neighbors::ivf_pq::search<T, IdxT>(        \
+    raft::device_resources const& handle,                        \
+    const raft::neighbors::ivf_pq::search_params& params,        \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,             \
+    raft::device_matrix_view<const T, IdxT, row_major> queries,  \
+    raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,   \
+    raft::device_matrix_view<float, IdxT, row_major> distances); \
+                                                                 \
+  template void raft::neighbors::ivf_pq::search<T, IdxT>(        \
+    raft::device_resources const& handle,                        \
+    const raft::neighbors::ivf_pq::search_params& params,        \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,             \
+    const T* queries,                                            \
+    uint32_t n_queries,                                          \
+    uint32_t k,                                                  \
+    IdxT* neighbors,                                             \
+    float* distances,                                            \
+    rmm::mr::device_memory_resource* mr)
 
-namespace raft::runtime::neighbors::ivf_pq {
+instantiate_raft_neighbors_ivf_pq_search(float, int64_t);
 
-#define RAFT_SEARCH_INST(T, IdxT)                                                                 \
-  void search(raft::device_resources const& handle,                                               \
-              const raft::neighbors::ivf_pq::search_params& params,                               \
-              const raft::neighbors::ivf_pq::index<IdxT>& idx,                                    \
-              raft::device_matrix_view<const T, IdxT, row_major> queries,                         \
-              raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,                          \
-              raft::device_matrix_view<float, IdxT, row_major> distances)                         \
-  {                                                                                               \
-    raft::neighbors::ivf_pq::search<T, IdxT>(handle, params, idx, queries, neighbors, distances); \
-  }
-
-RAFT_SEARCH_INST(float, int64_t);
-
-#undef RAFT_INST_SEARCH
-
-}  // namespace raft::runtime::neighbors::ivf_pq
+#undef instantiate_raft_neighbors_ivf_pq_search
diff --git a/cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu
index e1552c0b27..af54a9312a 100644
--- a/cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu
@@ -14,26 +14,29 @@
  * limitations under the License.
  */
 
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
+#include <raft/neighbors/ivf_pq-inl.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
 
-#include <raft_runtime/neighbors/ivf_pq.hpp>
+#define instantiate_raft_neighbors_ivf_pq_search(T, IdxT)        \
+  template void raft::neighbors::ivf_pq::search<T, IdxT>(        \
+    raft::device_resources const& handle,                        \
+    const raft::neighbors::ivf_pq::search_params& params,        \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,             \
+    raft::device_matrix_view<const T, IdxT, row_major> queries,  \
+    raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,   \
+    raft::device_matrix_view<float, IdxT, row_major> distances); \
+                                                                 \
+  template void raft::neighbors::ivf_pq::search<T, IdxT>(        \
+    raft::device_resources const& handle,                        \
+    const raft::neighbors::ivf_pq::search_params& params,        \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,             \
+    const T* queries,                                            \
+    uint32_t n_queries,                                          \
+    uint32_t k,                                                  \
+    IdxT* neighbors,                                             \
+    float* distances,                                            \
+    rmm::mr::device_memory_resource* mr)
 
-namespace raft::runtime::neighbors::ivf_pq {
+instantiate_raft_neighbors_ivf_pq_search(int8_t, int64_t);
 
-#define RAFT_SEARCH_INST(T, IdxT)                                                                 \
-  void search(raft::device_resources const& handle,                                               \
-              const raft::neighbors::ivf_pq::search_params& params,                               \
-              const raft::neighbors::ivf_pq::index<IdxT>& idx,                                    \
-              raft::device_matrix_view<const T, IdxT, row_major> queries,                         \
-              raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,                          \
-              raft::device_matrix_view<float, IdxT, row_major> distances)                         \
-  {                                                                                               \
-    raft::neighbors::ivf_pq::search<T, IdxT>(handle, params, idx, queries, neighbors, distances); \
-  }
-
-RAFT_SEARCH_INST(int8_t, int64_t);
-
-#undef RAFT_INST_SEARCH
-
-}  // namespace raft::runtime::neighbors::ivf_pq
+#undef instantiate_raft_neighbors_ivf_pq_search
diff --git a/cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu
index 85195a7551..7b49487506 100644
--- a/cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu
@@ -14,26 +14,29 @@
  * limitations under the License.
  */
 
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
+#include <raft/neighbors/ivf_pq-inl.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
 
-#include <raft_runtime/neighbors/ivf_pq.hpp>
+#define instantiate_raft_neighbors_ivf_pq_search(T, IdxT)        \
+  template void raft::neighbors::ivf_pq::search<T, IdxT>(        \
+    raft::device_resources const& handle,                        \
+    const raft::neighbors::ivf_pq::search_params& params,        \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,             \
+    raft::device_matrix_view<const T, IdxT, row_major> queries,  \
+    raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,   \
+    raft::device_matrix_view<float, IdxT, row_major> distances); \
+                                                                 \
+  template void raft::neighbors::ivf_pq::search<T, IdxT>(        \
+    raft::device_resources const& handle,                        \
+    const raft::neighbors::ivf_pq::search_params& params,        \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx,             \
+    const T* queries,                                            \
+    uint32_t n_queries,                                          \
+    uint32_t k,                                                  \
+    IdxT* neighbors,                                             \
+    float* distances,                                            \
+    rmm::mr::device_memory_resource* mr)
 
-namespace raft::runtime::neighbors::ivf_pq {
+instantiate_raft_neighbors_ivf_pq_search(uint8_t, int64_t);
 
-#define RAFT_SEARCH_INST(T, IdxT)                                                                 \
-  void search(raft::device_resources const& handle,                                               \
-              const raft::neighbors::ivf_pq::search_params& params,                               \
-              const raft::neighbors::ivf_pq::index<IdxT>& idx,                                    \
-              raft::device_matrix_view<const T, IdxT, row_major> queries,                         \
-              raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,                          \
-              raft::device_matrix_view<float, IdxT, row_major> distances)                         \
-  {                                                                                               \
-    raft::neighbors::ivf_pq::search<T, IdxT>(handle, params, idx, queries, neighbors, distances); \
-  }
-
-RAFT_SEARCH_INST(uint8_t, int64_t);
-
-#undef RAFT_INST_SEARCH
-
-}  // namespace raft::runtime::neighbors::ivf_pq
+#undef instantiate_raft_neighbors_ivf_pq_search
diff --git a/cpp/src/neighbors/ivfpq_serialize.cu b/cpp/src/neighbors/ivfpq_serialize.cu
index e251f1442f..0ba1b929b7 100644
--- a/cpp/src/neighbors/ivfpq_serialize.cu
+++ b/cpp/src/neighbors/ivfpq_serialize.cu
@@ -14,8 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
+#include <raft/neighbors/ivf_pq_serialize.cuh>
 
 #include <raft_runtime/neighbors/ivf_pq.hpp>
 

From 1a5c5889f7be29804d2b3dc3de3499e97cab1e0d Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 13 Apr 2023 15:14:23 +0200
Subject: [PATCH 28/89] Fix tests

---
 cpp/CMakeLists.txt                            |  1 +
 .../detail/ivf_flat_interleaved_scan-inl.cuh  |  2 +-
 .../detail/ivf_pq_compute_similarity-ext.cuh  | 91 +++++++++++--------
 .../detail/ivf_pq_compute_similarity-inl.cuh  | 59 ++++++++++--
 .../raft/neighbors/detail/ivf_pq_search.cuh   | 43 ++++-----
 cpp/include/raft/neighbors/ivf_flat_types.hpp |  7 +-
 .../ivf_pq_compute_similarity_00_generate.py  | 27 +++++-
 .../ivf_pq_compute_similarity_float_float.cu  | 26 +++++-
 ...f_pq_compute_similarity_float_fp8_false.cu | 26 +++++-
 ...vf_pq_compute_similarity_float_fp8_true.cu | 26 +++++-
 .../ivf_pq_compute_similarity_float_half.cu   | 26 +++++-
 ...vf_pq_compute_similarity_half_fp8_false.cu | 26 +++++-
 ...ivf_pq_compute_similarity_half_fp8_true.cu | 26 +++++-
 .../ivf_pq_compute_similarity_half_half.cu    | 26 +++++-
 cpp/test/cluster/linkage.cu                   |  9 ++
 cpp/test/distance/dist_adj_threshold.cuh      |  2 +
 cpp/test/distance/fused_l2_nn.cu              |  7 +-
 cpp/test/distance/masked_nn.cu                | 19 ++--
 cpp/test/neighbors/ann_ivf_pq.cuh             |  1 +
 .../ann_ivf_pq/test_float_uint32_t.cu         |  7 ++
 .../sparse/neighbors/connect_components.cu    |  9 ++
 21 files changed, 377 insertions(+), 89 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e3486062d5..1c2f5f266c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -387,6 +387,7 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/ivfpq_deserialize.cu
     src/neighbors/ivfpq_extend_float_int64_t.cu
     src/neighbors/ivfpq_extend_int8_t_int64_t.cu
+    src/neighbors/ivfpq_extend_uint8_t_int64_t.cu
     src/neighbors/ivfpq_search_float_int64_t.cu
     src/neighbors/ivfpq_search_int8_t_int64_t.cu
     src/neighbors/ivfpq_search_uint8_t_int64_t.cu
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh
index 4848022e30..01bfbf4a43 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh
@@ -997,7 +997,7 @@ struct select_interleaved_scan_kernel {
     if constexpr (Veclen > 1) {
       if (veclen % Veclen != 0) {
         return select_interleaved_scan_kernel<T, AccT, IdxT, Capacity, 1>::run(
-          capacity, veclen, select_min, std::forward<Args>(args)...);
+          capacity, 1, select_min, std::forward<Args>(args)...);
       }
     }
     // NB: this is the limitation of the warpsort structures that use a huge number of
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
index 2e15ebd665..104a31e869 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
@@ -16,38 +16,12 @@
 
 #pragma once
 
-#include <raft/spatial/knn/detail/ann_utils.cuh>
-
-#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
-#include <raft/neighbors/ivf_pq_types.hpp>
-
-#include <raft/core/cudart_utils.hpp>
-#include <raft/core/device_mdarray.hpp>
-#include <raft/core/device_resources.hpp>
-#include <raft/core/logger.hpp>
-#include <raft/core/nvtx.hpp>
-#include <raft/core/operators.hpp>
-#include <raft/distance/distance_types.hpp>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/map.cuh>
-#include <raft/linalg/unary_op.cuh>
-#include <raft/matrix/detail/select_k.cuh>
-#include <raft/matrix/detail/select_warpsort.cuh>
-#include <raft/util/cuda_utils.cuh>
-#include <raft/util/device_atomics.cuh>
-#include <raft/util/device_loads_stores.cuh>
-#include <raft/util/pow2_utils.cuh>
-#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
-#include <raft/util/vectorized.cuh>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-
-#include <cub/cub.cuh>
-
-#include <cuda_fp16.h>
-
-#include <optional>
+#include <cuda_fp16.h>                               // __half
+#include <raft/distance/distance_types.hpp>          // raft::distance::DistanceType
+#include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>  // raft::neighbors::ivf_pq::detail::fp_8bit
+#include <raft/neighbors/ivf_pq_types.hpp>           // raft::neighbors::ivf_pq::codebook_gen
+#include <raft/util/raft_explicit.hpp>               // RAFT_EXPLICIT
+#include <rmm/cuda_stream_view.hpp>                  // rmm::cuda_stream_view
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE
 
@@ -112,11 +86,32 @@ struct selected {
   dim3 block_dim;
   size_t smem_size;
   size_t device_lut_size;
-
-  template <typename... Args>
-  void operator()(rmm::cuda_stream_view stream, Args... args);
 };
 
+template <typename OutT, typename LutT>
+void compute_similarity_run(selected<OutT, LutT> s,
+                            rmm::cuda_stream_view stream,
+                            uint32_t n_rows,
+                            uint32_t dim,
+                            uint32_t n_probes,
+                            uint32_t pq_dim,
+                            uint32_t n_queries,
+                            distance::DistanceType metric,
+                            codebook_gen codebook_kind,
+                            uint32_t topk,
+                            uint32_t max_samples,
+                            const float* cluster_centers,
+                            const float* pq_centers,
+                            const uint8_t* const* pq_dataset,
+                            const uint32_t* cluster_labels,
+                            const uint32_t* _chunk_indices,
+                            const float* queries,
+                            const uint32_t* index_list,
+                            float* query_kths,
+                            LutT* lut_scores,
+                            OutT* _out_scores,
+                            uint32_t* _out_indices) RAFT_EXPLICIT;
+
 /**
  * Use heuristics to choose an optimal instance of the search kernel.
  * It selects among a few kernel variants (with/out using shared mem for
@@ -159,7 +154,31 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props,
     uint32_t n_queries,                                                                        \
     uint32_t n_probes,                                                                         \
     uint32_t topk)                                                                             \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;                                   \
+                                                                                               \
+  extern template void raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT>(    \
+    raft::neighbors::ivf_pq::detail::selected<OutT, LutT> s,                                   \
+    rmm::cuda_stream_view stream,                                                              \
+    uint32_t n_rows,                                                                           \
+    uint32_t dim,                                                                              \
+    uint32_t n_probes,                                                                         \
+    uint32_t pq_dim,                                                                           \
+    uint32_t n_queries,                                                                        \
+    raft::distance::DistanceType metric,                                                       \
+    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                       \
+    uint32_t topk,                                                                             \
+    uint32_t max_samples,                                                                      \
+    const float* cluster_centers,                                                              \
+    const float* pq_centers,                                                                   \
+    const uint8_t* const* pq_dataset,                                                          \
+    const uint32_t* cluster_labels,                                                            \
+    const uint32_t* _chunk_indices,                                                            \
+    const float* queries,                                                                      \
+    const uint32_t* index_list,                                                                \
+    float* query_kths,                                                                         \
+    LutT* lut_scores,                                                                          \
+    OutT* _out_scores,                                                                         \
+    uint32_t* _out_indices);
 
 #define COMMA ,
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
index 4ce205bda2..de79d3f6c8 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
@@ -38,8 +38,9 @@ static constexpr int kMaxCapacity = 128;
 static_assert((kMaxCapacity >= 32) && !(kMaxCapacity & (kMaxCapacity - 1)),
               "kMaxCapacity must be a power of two, not smaller than the WarpSize.");
 
-// inline here, because it may be compiled multiple times.
-inline auto is_local_topk_feasible(uint32_t k, uint32_t n_probes, uint32_t n_queries) -> bool
+// weak (not inline): this non-template definition may be compiled into multiple translation units.
+auto __attribute__((weak)) is_local_topk_feasible(uint32_t k, uint32_t n_probes, uint32_t n_queries)
+  -> bool
 {
   if (k > kMaxCapacity) { return false; }             // warp_sort not possible
   if (n_probes <= 16) { return false; }               // too few clusters
@@ -558,15 +559,55 @@ struct selected {
   dim3 block_dim;
   size_t smem_size;
   size_t device_lut_size;
-
-  template <typename... Args>
-  void operator()(rmm::cuda_stream_view stream, Args... args)
-  {
-    kernel<<<grid_dim, block_dim, smem_size, stream>>>(args...);
-    RAFT_CHECK_CUDA(stream);
-  }
 };
 
+template <typename OutT, typename LutT>
+void compute_similarity_run(selected<OutT, LutT> s,
+                            rmm::cuda_stream_view stream,
+                            uint32_t n_rows,
+                            uint32_t dim,
+                            uint32_t n_probes,
+                            uint32_t pq_dim,
+                            uint32_t n_queries,
+                            distance::DistanceType metric,
+                            codebook_gen codebook_kind,
+                            uint32_t topk,
+                            uint32_t max_samples,
+                            const float* cluster_centers,
+                            const float* pq_centers,
+                            const uint8_t* const* pq_dataset,
+                            const uint32_t* cluster_labels,
+                            const uint32_t* _chunk_indices,
+                            const float* queries,
+                            const uint32_t* index_list,
+                            float* query_kths,
+                            LutT* lut_scores,
+                            OutT* _out_scores,
+                            uint32_t* _out_indices)
+{
+  s.kernel<<<s.grid_dim, s.block_dim, s.smem_size, stream>>>(n_rows,
+                                                             dim,
+                                                             n_probes,
+                                                             pq_dim,
+                                                             n_queries,
+                                                             metric,
+                                                             codebook_kind,
+                                                             topk,
+                                                             max_samples,
+                                                             cluster_centers,
+                                                             pq_centers,
+                                                             pq_dataset,
+                                                             cluster_labels,
+                                                             _chunk_indices,
+                                                             queries,
+                                                             index_list,
+                                                             query_kths,
+                                                             lut_scores,
+                                                             _out_scores,
+                                                             _out_indices);
+  RAFT_CHECK_CUDA(stream);
+}
+
 /**
  * Use heuristics to choose an optimal instance of the search kernel.
  * It selects among a few kernel variants (with/out using shared mem for
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
index 4256b1631a..53d1fd6290 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
@@ -548,27 +548,28 @@ void ivfpq_search_worker(raft::device_resources const& handle,
                 raft::const_op<float>{dummy_block_sort_t<ScoreT, IdxT>::queue_t::kDummy});
     query_kths = query_kths_buf->data_handle();
   }
-  search_instance(stream,
-                  index.size(),
-                  index.rot_dim(),
-                  n_probes,
-                  index.pq_dim(),
-                  n_queries,
-                  index.metric(),
-                  index.codebook_kind(),
-                  topK,
-                  max_samples,
-                  index.centers_rot().data_handle(),
-                  index.pq_centers().data_handle(),
-                  index.data_ptrs().data_handle(),
-                  clusters_to_probe,
-                  chunk_index.data(),
-                  query,
-                  index_list_sorted,
-                  query_kths,
-                  device_lut.data(),
-                  distances_buf.data(),
-                  neighbors_ptr);
+  compute_similarity_run(search_instance,
+                         stream,
+                         index.size(),
+                         index.rot_dim(),
+                         n_probes,
+                         index.pq_dim(),
+                         n_queries,
+                         index.metric(),
+                         index.codebook_kind(),
+                         topK,
+                         max_samples,
+                         index.centers_rot().data_handle(),
+                         index.pq_centers().data_handle(),
+                         index.data_ptrs().data_handle(),
+                         clusters_to_probe,
+                         chunk_index.data(),
+                         query,
+                         index_list_sorted,
+                         query_kths,
+                         device_lut.data(),
+                         distances_buf.data(),
+                         neighbors_ptr);
 
   // Select topk vectors for each query
   rmm::device_uvector<ScoreT> topk_dists(n_queries * topK, stream, mr);
diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp
index 39269738dc..e9d8111f47 100644
--- a/cpp/include/raft/neighbors/ivf_flat_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp
@@ -380,10 +380,11 @@ struct index : ann::index {
   {
     // TODO: consider padding the dimensions and fixing veclen to its maximum possible value as a
     // template parameter (https://github.com/rapidsai/raft/issues/711)
+
+    // NOTE: keep this consistent with the select_interleaved_scan_kernel logic
+    // in detail/ivf_flat_interleaved_scan-inl.cuh.
     uint32_t veclen = std::max<uint32_t>(1, 16 / sizeof(T));
-    while (dim % veclen != 0) {
-      veclen = veclen >> 1;
-    }
+    if (dim % veclen != 0) { veclen = 1; }
     return veclen;
   }
 };
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py
index 48f0021595..e3eead977c 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py
@@ -31,7 +31,32 @@
         uint32_t precomp_data_count,                                    \\
         uint32_t n_queries,                                             \\
         uint32_t n_probes,                                              \\
-        uint32_t topk) -> raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+        uint32_t topk) -> raft::neighbors::ivf_pq::detail::selected<OutT, LutT>; \\
+\\
+    template void raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT>( \\
+        raft::neighbors::ivf_pq::detail::selected<OutT, LutT> s,        \\
+        rmm::cuda_stream_view stream,                                   \\
+        uint32_t n_rows,                                                \\
+        uint32_t dim,                                                   \\
+        uint32_t n_probes,                                              \\
+        uint32_t pq_dim,                                                \\
+        uint32_t n_queries,                                             \\
+        raft::distance::DistanceType metric,                                  \\
+        raft::neighbors::ivf_pq::codebook_gen codebook_kind,            \\
+        uint32_t topk,                                                  \\
+        uint32_t max_samples,                                           \\
+        const float* cluster_centers,                                   \\
+        const float* pq_centers,                                        \\
+        const uint8_t* const* pq_dataset,                               \\
+        const uint32_t* cluster_labels,                                 \\
+        const uint32_t* _chunk_indices,                                 \\
+        const float* queries,                                           \\
+        const uint32_t* index_list,                                     \\
+        float* query_kths,                                              \\
+        LutT* lut_scores,                                               \\
+        OutT* _out_scores,                                              \\
+        uint32_t* _out_indices);
+
 
 #define COMMA ,
 """
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
index 4adc0d2029..ea31db16f4 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
@@ -30,7 +30,31 @@
     uint32_t n_queries,                                                                 \
     uint32_t n_probes,                                                                  \
     uint32_t topk)                                                                      \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;                            \
+                                                                                        \
+  template void raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT>(    \
+    raft::neighbors::ivf_pq::detail::selected<OutT, LutT> s,                            \
+    rmm::cuda_stream_view stream,                                                       \
+    uint32_t n_rows,                                                                    \
+    uint32_t dim,                                                                       \
+    uint32_t n_probes,                                                                  \
+    uint32_t pq_dim,                                                                    \
+    uint32_t n_queries,                                                                 \
+    raft::distance::DistanceType metric,                                                \
+    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                \
+    uint32_t topk,                                                                      \
+    uint32_t max_samples,                                                               \
+    const float* cluster_centers,                                                       \
+    const float* pq_centers,                                                            \
+    const uint8_t* const* pq_dataset,                                                   \
+    const uint32_t* cluster_labels,                                                     \
+    const uint32_t* _chunk_indices,                                                     \
+    const float* queries,                                                               \
+    const uint32_t* index_list,                                                         \
+    float* query_kths,                                                                  \
+    LutT* lut_scores,                                                                   \
+    OutT* _out_scores,                                                                  \
+    uint32_t* _out_indices);
 
 #define COMMA ,
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(float, float);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
index 2cdcb2f47e..02826f6f45 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
@@ -30,7 +30,31 @@
     uint32_t n_queries,                                                                 \
     uint32_t n_probes,                                                                  \
     uint32_t topk)                                                                      \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;                            \
+                                                                                        \
+  template void raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT>(    \
+    raft::neighbors::ivf_pq::detail::selected<OutT, LutT> s,                            \
+    rmm::cuda_stream_view stream,                                                       \
+    uint32_t n_rows,                                                                    \
+    uint32_t dim,                                                                       \
+    uint32_t n_probes,                                                                  \
+    uint32_t pq_dim,                                                                    \
+    uint32_t n_queries,                                                                 \
+    raft::distance::DistanceType metric,                                                \
+    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                \
+    uint32_t topk,                                                                      \
+    uint32_t max_samples,                                                               \
+    const float* cluster_centers,                                                       \
+    const float* pq_centers,                                                            \
+    const uint8_t* const* pq_dataset,                                                   \
+    const uint32_t* cluster_labels,                                                     \
+    const uint32_t* _chunk_indices,                                                     \
+    const float* queries,                                                               \
+    const uint32_t* index_list,                                                         \
+    float* query_kths,                                                                  \
+    LutT* lut_scores,                                                                   \
+    OutT* _out_scores,                                                                  \
+    uint32_t* _out_indices);
 
 #define COMMA ,
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
index db0226a83b..61558da70b 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
@@ -30,7 +30,31 @@
     uint32_t n_queries,                                                                 \
     uint32_t n_probes,                                                                  \
     uint32_t topk)                                                                      \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;                            \
+                                                                                        \
+  template void raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT>(    \
+    raft::neighbors::ivf_pq::detail::selected<OutT, LutT> s,                            \
+    rmm::cuda_stream_view stream,                                                       \
+    uint32_t n_rows,                                                                    \
+    uint32_t dim,                                                                       \
+    uint32_t n_probes,                                                                  \
+    uint32_t pq_dim,                                                                    \
+    uint32_t n_queries,                                                                 \
+    raft::distance::DistanceType metric,                                                \
+    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                \
+    uint32_t topk,                                                                      \
+    uint32_t max_samples,                                                               \
+    const float* cluster_centers,                                                       \
+    const float* pq_centers,                                                            \
+    const uint8_t* const* pq_dataset,                                                   \
+    const uint32_t* cluster_labels,                                                     \
+    const uint32_t* _chunk_indices,                                                     \
+    const float* queries,                                                               \
+    const uint32_t* index_list,                                                         \
+    float* query_kths,                                                                  \
+    LutT* lut_scores,                                                                   \
+    OutT* _out_scores,                                                                  \
+    uint32_t* _out_indices);
 
 #define COMMA ,
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
index 4d8efbc6a4..c97ee5b20b 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
@@ -30,7 +30,31 @@
     uint32_t n_queries,                                                                 \
     uint32_t n_probes,                                                                  \
     uint32_t topk)                                                                      \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;                            \
+                                                                                        \
+  template void raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT>(    \
+    raft::neighbors::ivf_pq::detail::selected<OutT, LutT> s,                            \
+    rmm::cuda_stream_view stream,                                                       \
+    uint32_t n_rows,                                                                    \
+    uint32_t dim,                                                                       \
+    uint32_t n_probes,                                                                  \
+    uint32_t pq_dim,                                                                    \
+    uint32_t n_queries,                                                                 \
+    raft::distance::DistanceType metric,                                                \
+    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                \
+    uint32_t topk,                                                                      \
+    uint32_t max_samples,                                                               \
+    const float* cluster_centers,                                                       \
+    const float* pq_centers,                                                            \
+    const uint8_t* const* pq_dataset,                                                   \
+    const uint32_t* cluster_labels,                                                     \
+    const uint32_t* _chunk_indices,                                                     \
+    const float* queries,                                                               \
+    const uint32_t* index_list,                                                         \
+    float* query_kths,                                                                  \
+    LutT* lut_scores,                                                                   \
+    OutT* _out_scores,                                                                  \
+    uint32_t* _out_indices);
 
 #define COMMA ,
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(float, half);
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
index ec02753375..de4325b33b 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
@@ -30,7 +30,31 @@
     uint32_t n_queries,                                                                 \
     uint32_t n_probes,                                                                  \
     uint32_t topk)                                                                      \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;                            \
+                                                                                        \
+  template void raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT>(    \
+    raft::neighbors::ivf_pq::detail::selected<OutT, LutT> s,                            \
+    rmm::cuda_stream_view stream,                                                       \
+    uint32_t n_rows,                                                                    \
+    uint32_t dim,                                                                       \
+    uint32_t n_probes,                                                                  \
+    uint32_t pq_dim,                                                                    \
+    uint32_t n_queries,                                                                 \
+    raft::distance::DistanceType metric,                                                \
+    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                \
+    uint32_t topk,                                                                      \
+    uint32_t max_samples,                                                               \
+    const float* cluster_centers,                                                       \
+    const float* pq_centers,                                                            \
+    const uint8_t* const* pq_dataset,                                                   \
+    const uint32_t* cluster_labels,                                                     \
+    const uint32_t* _chunk_indices,                                                     \
+    const float* queries,                                                               \
+    const uint32_t* index_list,                                                         \
+    float* query_kths,                                                                  \
+    LutT* lut_scores,                                                                   \
+    OutT* _out_scores,                                                                  \
+    uint32_t* _out_indices);
 
 #define COMMA ,
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
index 20573a3570..baec4f1bea 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
@@ -30,7 +30,31 @@
     uint32_t n_queries,                                                                 \
     uint32_t n_probes,                                                                  \
     uint32_t topk)                                                                      \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;                            \
+                                                                                        \
+  template void raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT>(    \
+    raft::neighbors::ivf_pq::detail::selected<OutT, LutT> s,                            \
+    rmm::cuda_stream_view stream,                                                       \
+    uint32_t n_rows,                                                                    \
+    uint32_t dim,                                                                       \
+    uint32_t n_probes,                                                                  \
+    uint32_t pq_dim,                                                                    \
+    uint32_t n_queries,                                                                 \
+    raft::distance::DistanceType metric,                                                \
+    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                \
+    uint32_t topk,                                                                      \
+    uint32_t max_samples,                                                               \
+    const float* cluster_centers,                                                       \
+    const float* pq_centers,                                                            \
+    const uint8_t* const* pq_dataset,                                                   \
+    const uint32_t* cluster_labels,                                                     \
+    const uint32_t* _chunk_indices,                                                     \
+    const float* queries,                                                               \
+    const uint32_t* index_list,                                                         \
+    float* query_kths,                                                                  \
+    LutT* lut_scores,                                                                   \
+    OutT* _out_scores,                                                                  \
+    uint32_t* _out_indices);
 
 #define COMMA ,
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
index eb0217fe4e..a73ed07ce4 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
@@ -30,7 +30,31 @@
     uint32_t n_queries,                                                                 \
     uint32_t n_probes,                                                                  \
     uint32_t topk)                                                                      \
-    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;
+    ->raft::neighbors::ivf_pq::detail::selected<OutT, LutT>;                            \
+                                                                                        \
+  template void raft::neighbors::ivf_pq::detail::compute_similarity_run<OutT, LutT>(    \
+    raft::neighbors::ivf_pq::detail::selected<OutT, LutT> s,                            \
+    rmm::cuda_stream_view stream,                                                       \
+    uint32_t n_rows,                                                                    \
+    uint32_t dim,                                                                       \
+    uint32_t n_probes,                                                                  \
+    uint32_t pq_dim,                                                                    \
+    uint32_t n_queries,                                                                 \
+    raft::distance::DistanceType metric,                                                \
+    raft::neighbors::ivf_pq::codebook_gen codebook_kind,                                \
+    uint32_t topk,                                                                      \
+    uint32_t max_samples,                                                               \
+    const float* cluster_centers,                                                       \
+    const float* pq_centers,                                                            \
+    const uint8_t* const* pq_dataset,                                                   \
+    const uint32_t* cluster_labels,                                                     \
+    const uint32_t* _chunk_indices,                                                     \
+    const float* queries,                                                               \
+    const uint32_t* index_list,                                                         \
+    float* query_kths,                                                                  \
+    LutT* lut_scores,                                                                   \
+    OutT* _out_scores,                                                                  \
+    uint32_t* _out_indices);
 
 #define COMMA ,
 instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(half, half);
diff --git a/cpp/test/cluster/linkage.cu b/cpp/test/cluster/linkage.cu
index 4946d52f26..c5239063c3 100644
--- a/cpp/test/cluster/linkage.cu
+++ b/cpp/test/cluster/linkage.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+// XXX: We allow fused_l2_nn to be instantiated in this translation unit for the calls below:
+// raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> red_op(colors.data(), params.n_row);
+// raft::linkage::connect_components<value_idx, value_t>(
+//   handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op);
+//
+// TODO: consider adding this to libraft.so or creating an instance in a
+// separate translation unit for this test.
+#undef RAFT_EXPLICIT_INSTANTIATE
+
 #include "../test_utils.cuh"
 
 #include <raft/distance/distance_types.hpp>
diff --git a/cpp/test/distance/dist_adj_threshold.cuh b/cpp/test/distance/dist_adj_threshold.cuh
index ad02be64aa..78663b3cd1 100644
--- a/cpp/test/distance/dist_adj_threshold.cuh
+++ b/cpp/test/distance/dist_adj_threshold.cuh
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <cstdint>  // uint8_t
+
 namespace raft::distance {
 
 template <typename AccT, typename DataT, typename OutT, typename Index>
diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu
index 383ad39319..84ad52a324 100644
--- a/cpp/test/distance/fused_l2_nn.cu
+++ b/cpp/test/distance/fused_l2_nn.cu
@@ -92,12 +92,13 @@ void naive(raft::KeyValuePair<int, DataT>* min,
   static const dim3 TPB(32, 16, 1);
   dim3 nblks(raft::ceildiv(n, (int)TPB.x), raft::ceildiv(m, (int)TPB.y), 1);
   RAFT_CUDA_TRY(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream));
-  auto blks = raft::ceildiv(m, 256);
-  MinAndDistanceReduceOp<int, DataT> op;
+  auto blks                    = raft::ceildiv(m, 256);
+  using MinAndDistanceReduceOp = raft::distance::detail::MinAndDistanceReduceOpImpl<int, DataT>;
+  MinAndDistanceReduceOp op;
   detail::initKernel<DataT, raft::KeyValuePair<int, DataT>, int>
     <<<blks, 256, 0, stream>>>(min, m, std::numeric_limits<DataT>::max(), op);
   RAFT_CUDA_TRY(cudaGetLastError());
-  naiveKernel<DataT, Sqrt, MinAndDistanceReduceOp<int, DataT>, 16>
+  naiveKernel<DataT, Sqrt, MinAndDistanceReduceOp, 16>
     <<<nblks, TPB, 0, stream>>>(min, x, y, m, n, k, workspace, std::numeric_limits<DataT>::max());
   RAFT_CUDA_TRY(cudaGetLastError());
 }
diff --git a/cpp/test/distance/masked_nn.cu b/cpp/test/distance/masked_nn.cu
index d01911206b..2b6081fcc3 100644
--- a/cpp/test/distance/masked_nn.cu
+++ b/cpp/test/distance/masked_nn.cu
@@ -20,6 +20,7 @@
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/kvp.hpp>
+#include <raft/distance/detail/fused_l2_nn.cuh>  // MinAndDistanceReduceOpImpl
 #include <raft/distance/detail/masked_nn.cuh>
 #include <raft/distance/masked_nn.cuh>
 #include <raft/linalg/norm.cuh>
@@ -133,9 +134,10 @@ __global__ __launch_bounds__(32 * NWARPS,
       __shared__ typename WarpReduce::TempStorage temp[NWARPS];
       int warpId = threadIdx.x / raft::WarpSize;
       raft::KeyValuePair<int, DataT> tmp;
-      tmp.key   = include_dist ? nidx : -1;
-      tmp.value = include_dist ? acc : maxVal;
-      tmp       = WarpReduce(temp[warpId]).Reduce(tmp, raft::distance::KVPMinReduce<int, DataT>{});
+      tmp.key            = include_dist ? nidx : -1;
+      tmp.value          = include_dist ? acc : maxVal;
+      using KVPMinReduce = raft::distance::detail::KVPMinReduceImpl<int, DataT>;
+      tmp                = WarpReduce(temp[warpId]).Reduce(tmp, KVPMinReduce{});
       if (threadIdx.x % raft::WarpSize == 0 && midx < m) {
         while (atomicCAS(workspace + midx, 0, 1) == 1)
           ;
@@ -215,9 +217,10 @@ auto reference(const raft::handle_t& handle, Inputs<DataT> inp, const Params& p)
   RAFT_CUDA_TRY(cudaMemsetAsync(workspace.data(), 0, sizeof(int) * m, stream));
 
   // Initialize output
-  auto out  = raft::make_device_vector<OutT, int>(handle, m);
-  auto blks = raft::ceildiv(m, 256);
-  MinAndDistanceReduceOp<int, DataT> op;
+  auto out                     = raft::make_device_vector<OutT, int>(handle, m);
+  auto blks                    = raft::ceildiv(m, 256);
+  using MinAndDistanceReduceOp = raft::distance::detail::MinAndDistanceReduceOpImpl<int, DataT>;
+  MinAndDistanceReduceOp op;
   raft::distance::detail::initKernel<DataT, raft::KeyValuePair<int, DataT>, int>
     <<<blks, 256, 0, stream>>>(out.data_handle(), m, std::numeric_limits<DataT>::max(), op);
   RAFT_CUDA_TRY(cudaGetLastError());
@@ -265,8 +268,8 @@ auto run_masked_nn(const raft::handle_t& handle, Inputs<DataT> inp, const Params
 
   // Create parameters for masked_l2_nn
   using IdxT       = int;
-  using RedOpT     = MinAndDistanceReduceOp<int, DataT>;
-  using PairRedOpT = raft::distance::KVPMinReduce<int, DataT>;
+  using RedOpT     = raft::distance::detail::MinAndDistanceReduceOpImpl<int, DataT>;
+  using PairRedOpT = raft::distance::detail::KVPMinReduceImpl<int, DataT>;
   using ParamT     = raft::distance::masked_l2_nn_params<RedOpT, PairRedOpT>;
 
   bool init_out = true;
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index c331081314..458a40d9f2 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -23,6 +23,7 @@
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/neighbors/ivf_pq.cuh>
+#include <raft/neighbors/ivf_pq_serialize.cuh>
 #include <raft/random/rng.cuh>
 #ifdef RAFT_COMPILED
 #include <raft/neighbors/specializations.cuh>
diff --git a/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
index c14afe4d70..67f790a8ae 100644
--- a/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
+++ b/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
@@ -14,6 +14,13 @@
  * limitations under the License.
  */
 
+// XXX: the uint32_t instance is not compiled into libraft.so, so we allow
+// instantiating the template in this translation unit.
+//
+// TODO: consider removing this test or consider adding an instantiation to the
+// library.
+#undef RAFT_EXPLICIT_INSTANTIATE
+
 #include "../ann_ivf_pq.cuh"
 
 namespace raft::neighbors::ivf_pq {
diff --git a/cpp/test/sparse/neighbors/connect_components.cu b/cpp/test/sparse/neighbors/connect_components.cu
index d200744329..a4fdd35558 100644
--- a/cpp/test/sparse/neighbors/connect_components.cu
+++ b/cpp/test/sparse/neighbors/connect_components.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+// XXX: We allow the instantiation of fused_l2_nn here:
+// raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> red_op(colors.data(), params.n_row);
+// raft::linkage::connect_components<value_idx, value_t>(
+//   handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op);
+//
+// TODO: consider adding this to libraft.so or creating an instance in a
+// separate translation unit for this test.
+#undef RAFT_EXPLICIT_INSTANTIATE
+
 #include <gtest/gtest.h>
 
 #include <cub/cub.cuh>

From 56a4c4a8f23290a56be7a17462f2061d3c8d6710 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 13 Apr 2023 16:02:40 +0200
Subject: [PATCH 29/89] Move raft_runtime src files

Instead of having the specializations in sub-directories, the
raft_runtime source files now mimic the include/ directory hierarchy.
---
 cpp/CMakeLists.txt                            | 64 +++++++++----------
 .../cluster/cluster_cost.cuh                  |  0
 .../cluster/cluster_cost_double.cu            |  0
 .../cluster/cluster_cost_float.cu             |  0
 .../cluster/kmeans_fit_double.cu              |  0
 .../cluster/kmeans_fit_float.cu               |  0
 .../cluster/kmeans_init_plus_plus_double.cu   |  0
 .../cluster/kmeans_init_plus_plus_float.cu    |  0
 .../cluster/update_centroids.cuh              |  0
 .../cluster/update_centroids_double.cu        |  0
 .../cluster/update_centroids_float.cu         |  0
 .../distance/fused_l2_min_arg.cu              |  0
 .../distance/pairwise_distance.cu             |  0
 .../brute_force_knn_int64_t_float.cu          |  0
 .../neighbors/ivf_flat_build.cu               |  0
 .../neighbors/ivf_flat_search.cu              |  0
 .../neighbors/ivfpq_build.cu                  |  0
 .../neighbors/ivfpq_deserialize.cu            |  2 +
 .../neighbors/ivfpq_search_float_int64_t.cu   | 39 +++++++++++
 .../neighbors/ivfpq_search_int8_t_int64_t.cu  | 39 +++++++++++
 .../neighbors/ivfpq_search_uint8_t_int64_t.cu | 39 +++++++++++
 .../neighbors/ivfpq_serialize.cu              |  1 +
 .../neighbors/refine_d_int64_t_float.cu       |  0
 .../neighbors/refine_d_int64_t_int8_t.cu      |  0
 .../neighbors/refine_d_int64_t_uint8_t.cu     |  0
 .../neighbors/refine_h_int64_t_float.cu       |  0
 .../neighbors/refine_h_int64_t_int8_t.cu      |  0
 .../neighbors/refine_h_int64_t_uint8_t.cu     |  0
 cpp/src/{ => raft_runtime}/random/common.cuh  |  0
 ...rmat_rectangular_generator_int64_double.cu |  0
 .../rmat_rectangular_generator_int64_float.cu |  0
 .../rmat_rectangular_generator_int_double.cu  |  0
 .../rmat_rectangular_generator_int_float.cu   |  0
 33 files changed, 151 insertions(+), 33 deletions(-)
 rename cpp/src/{ => raft_runtime}/cluster/cluster_cost.cuh (100%)
 rename cpp/src/{ => raft_runtime}/cluster/cluster_cost_double.cu (100%)
 rename cpp/src/{ => raft_runtime}/cluster/cluster_cost_float.cu (100%)
 rename cpp/src/{ => raft_runtime}/cluster/kmeans_fit_double.cu (100%)
 rename cpp/src/{ => raft_runtime}/cluster/kmeans_fit_float.cu (100%)
 rename cpp/src/{ => raft_runtime}/cluster/kmeans_init_plus_plus_double.cu (100%)
 rename cpp/src/{ => raft_runtime}/cluster/kmeans_init_plus_plus_float.cu (100%)
 rename cpp/src/{ => raft_runtime}/cluster/update_centroids.cuh (100%)
 rename cpp/src/{ => raft_runtime}/cluster/update_centroids_double.cu (100%)
 rename cpp/src/{ => raft_runtime}/cluster/update_centroids_float.cu (100%)
 rename cpp/src/{ => raft_runtime}/distance/fused_l2_min_arg.cu (100%)
 rename cpp/src/{ => raft_runtime}/distance/pairwise_distance.cu (100%)
 rename cpp/src/{ => raft_runtime}/neighbors/brute_force_knn_int64_t_float.cu (100%)
 rename cpp/src/{ => raft_runtime}/neighbors/ivf_flat_build.cu (100%)
 rename cpp/src/{ => raft_runtime}/neighbors/ivf_flat_search.cu (100%)
 rename cpp/src/{ => raft_runtime}/neighbors/ivfpq_build.cu (100%)
 rename cpp/src/{ => raft_runtime}/neighbors/ivfpq_deserialize.cu (96%)
 create mode 100644 cpp/src/raft_runtime/neighbors/ivfpq_search_float_int64_t.cu
 create mode 100644 cpp/src/raft_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu
 create mode 100644 cpp/src/raft_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu
 rename cpp/src/{ => raft_runtime}/neighbors/ivfpq_serialize.cu (96%)
 rename cpp/src/{ => raft_runtime}/neighbors/refine_d_int64_t_float.cu (100%)
 rename cpp/src/{ => raft_runtime}/neighbors/refine_d_int64_t_int8_t.cu (100%)
 rename cpp/src/{ => raft_runtime}/neighbors/refine_d_int64_t_uint8_t.cu (100%)
 rename cpp/src/{ => raft_runtime}/neighbors/refine_h_int64_t_float.cu (100%)
 rename cpp/src/{ => raft_runtime}/neighbors/refine_h_int64_t_int8_t.cu (100%)
 rename cpp/src/{ => raft_runtime}/neighbors/refine_h_int64_t_uint8_t.cu (100%)
 rename cpp/src/{ => raft_runtime}/random/common.cuh (100%)
 rename cpp/src/{ => raft_runtime}/random/rmat_rectangular_generator_int64_double.cu (100%)
 rename cpp/src/{ => raft_runtime}/random/rmat_rectangular_generator_int64_float.cu (100%)
 rename cpp/src/{ => raft_runtime}/random/rmat_rectangular_generator_int_double.cu (100%)
 rename cpp/src/{ => raft_runtime}/random/rmat_rectangular_generator_int_float.cu (100%)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 1c2f5f266c..378c1004cd 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -275,20 +275,8 @@ set_target_properties(raft_compiled PROPERTIES EXPORT_NAME compiled)
 if(RAFT_COMPILE_LIBRARY)
   add_library(
     raft_lib
-    src/distance/pairwise_distance.cu
-    src/distance/fused_l2_min_arg.cu
-    src/cluster/update_centroids_float.cu
-    src/cluster/update_centroids_double.cu
-    src/cluster/cluster_cost_float.cu
-    src/cluster/cluster_cost_double.cu
     src/core/logger.cpp
     src/linalg/detail/coalesced_reduction.cu
-    src/neighbors/refine_d_int64_t_float.cu
-    src/neighbors/refine_d_int64_t_int8_t.cu
-    src/neighbors/refine_d_int64_t_uint8_t.cu
-    src/neighbors/refine_h_int64_t_float.cu
-    src/neighbors/refine_h_int64_t_int8_t.cu
-    src/neighbors/refine_h_int64_t_uint8_t.cu
     src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
     src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
     src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
@@ -303,10 +291,6 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/specializations/refine_h_int64_t_float.cu
     src/neighbors/specializations/refine_h_int64_t_int8_t.cu
     src/neighbors/specializations/refine_h_int64_t_uint8_t.cu
-    src/cluster/kmeans_fit_float.cu
-    src/cluster/kmeans_fit_double.cu
-    src/cluster/kmeans_init_plus_plus_double.cu
-    src/cluster/kmeans_init_plus_plus_float.cu
     src/distance/distance.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
@@ -342,7 +326,6 @@ if(RAFT_COMPILE_LIBRARY)
     # These are somehow missing a kernel definition which is causing a compile error.
     # src/distance/specializations/detail/kernels/rbf_kernel_double.cu
     # src/distance/specializations/detail/kernels/rbf_kernel_float.cu
-    src/neighbors/brute_force_knn_int64_t_float.cu
     src/distance/specializations/detail/kernels/tanh_kernel_double.cu
     src/distance/specializations/detail/kernels/tanh_kernel_float.cu
     src/matrix/detail/select_k_double_int64_t.cu
@@ -351,9 +334,6 @@ if(RAFT_COMPILE_LIBRARY)
     src/matrix/detail/select_k_float_uint32_t.cu
     src/matrix/detail/select_k_half_int64_t.cu
     src/matrix/detail/select_k_half_uint32_t.cu
-    src/neighbors/ivfpq_build.cu
-    src/neighbors/ivfpq_deserialize.cu
-    src/neighbors/ivfpq_serialize.cu
     src/neighbors/ivfpq_search_float_int64_t.cu
     src/neighbors/ivfpq_search_int8_t_int64_t.cu
     src/neighbors/ivfpq_search_uint8_t_int64_t.cu
@@ -361,10 +341,6 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/specializations/detail/brute_force_knn_impl_long_float_uint.cu
     src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_int.cu
     src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_uint.cu
-    src/random/rmat_rectangular_generator_int_double.cu
-    src/random/rmat_rectangular_generator_int64_double.cu
-    src/random/rmat_rectangular_generator_int_float.cu
-    src/random/rmat_rectangular_generator_int64_float.cu
     src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu
     src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu
     src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu
@@ -372,30 +348,52 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/specializations/ball_cover_all_knn_query.cu
     src/neighbors/specializations/ball_cover_build_index.cu
     src/neighbors/specializations/ball_cover_knn_query.cu
-    src/neighbors/ivf_flat_search.cu
-    src/neighbors/ivf_flat_build.cu
     src/neighbors/specializations/ivfflat_build_float_int64_t.cu
     src/neighbors/specializations/ivfflat_build_int8_t_int64_t.cu
     src/neighbors/specializations/ivfflat_build_uint8_t_int64_t.cu
     src/neighbors/specializations/ivfflat_extend_float_int64_t.cu
     src/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu
     src/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu
-    src/neighbors/ivfpq_build.cu
     src/neighbors/ivfpq_build_float_int64_t.cu
     src/neighbors/ivfpq_build_int8_t_int64_t.cu
     src/neighbors/ivfpq_build_uint8_t_int64_t.cu
-    src/neighbors/ivfpq_deserialize.cu
     src/neighbors/ivfpq_extend_float_int64_t.cu
     src/neighbors/ivfpq_extend_int8_t_int64_t.cu
     src/neighbors/ivfpq_extend_uint8_t_int64_t.cu
     src/neighbors/ivfpq_search_float_int64_t.cu
     src/neighbors/ivfpq_search_int8_t_int64_t.cu
     src/neighbors/ivfpq_search_uint8_t_int64_t.cu
-    src/neighbors/ivfpq_serialize.cu
-    src/random/rmat_rectangular_generator_int_double.cu
-    src/random/rmat_rectangular_generator_int64_double.cu
-    src/random/rmat_rectangular_generator_int_float.cu
-    src/random/rmat_rectangular_generator_int64_float.cu
+    src/raft_runtime/cluster/cluster_cost.cuh
+    src/raft_runtime/cluster/cluster_cost_double.cu
+    src/raft_runtime/cluster/cluster_cost_float.cu
+    src/raft_runtime/cluster/kmeans_fit_double.cu
+    src/raft_runtime/cluster/kmeans_fit_float.cu
+    src/raft_runtime/cluster/kmeans_init_plus_plus_double.cu
+    src/raft_runtime/cluster/kmeans_init_plus_plus_float.cu
+    src/raft_runtime/cluster/update_centroids.cuh
+    src/raft_runtime/cluster/update_centroids_double.cu
+    src/raft_runtime/cluster/update_centroids_float.cu
+    src/raft_runtime/distance/fused_l2_min_arg.cu
+    src/raft_runtime/distance/pairwise_distance.cu
+    src/raft_runtime/neighbors/brute_force_knn_int64_t_float.cu
+    src/raft_runtime/neighbors/ivf_flat_build.cu
+    src/raft_runtime/neighbors/ivf_flat_search.cu
+    src/raft_runtime/neighbors/ivfpq_build.cu
+    src/raft_runtime/neighbors/ivfpq_deserialize.cu
+    src/raft_runtime/neighbors/ivfpq_search_float_int64_t.cu
+    src/raft_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu
+    src/raft_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu
+    src/raft_runtime/neighbors/ivfpq_serialize.cu
+    src/raft_runtime/neighbors/refine_d_int64_t_float.cu
+    src/raft_runtime/neighbors/refine_d_int64_t_int8_t.cu
+    src/raft_runtime/neighbors/refine_d_int64_t_uint8_t.cu
+    src/raft_runtime/neighbors/refine_h_int64_t_float.cu
+    src/raft_runtime/neighbors/refine_h_int64_t_int8_t.cu
+    src/raft_runtime/neighbors/refine_h_int64_t_uint8_t.cu
+    src/raft_runtime/random/rmat_rectangular_generator_int64_double.cu
+    src/raft_runtime/random/rmat_rectangular_generator_int64_float.cu
+    src/raft_runtime/random/rmat_rectangular_generator_int_double.cu
+    src/raft_runtime/random/rmat_rectangular_generator_int_float.cu
     src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
     src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
     src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
diff --git a/cpp/src/cluster/cluster_cost.cuh b/cpp/src/raft_runtime/cluster/cluster_cost.cuh
similarity index 100%
rename from cpp/src/cluster/cluster_cost.cuh
rename to cpp/src/raft_runtime/cluster/cluster_cost.cuh
diff --git a/cpp/src/cluster/cluster_cost_double.cu b/cpp/src/raft_runtime/cluster/cluster_cost_double.cu
similarity index 100%
rename from cpp/src/cluster/cluster_cost_double.cu
rename to cpp/src/raft_runtime/cluster/cluster_cost_double.cu
diff --git a/cpp/src/cluster/cluster_cost_float.cu b/cpp/src/raft_runtime/cluster/cluster_cost_float.cu
similarity index 100%
rename from cpp/src/cluster/cluster_cost_float.cu
rename to cpp/src/raft_runtime/cluster/cluster_cost_float.cu
diff --git a/cpp/src/cluster/kmeans_fit_double.cu b/cpp/src/raft_runtime/cluster/kmeans_fit_double.cu
similarity index 100%
rename from cpp/src/cluster/kmeans_fit_double.cu
rename to cpp/src/raft_runtime/cluster/kmeans_fit_double.cu
diff --git a/cpp/src/cluster/kmeans_fit_float.cu b/cpp/src/raft_runtime/cluster/kmeans_fit_float.cu
similarity index 100%
rename from cpp/src/cluster/kmeans_fit_float.cu
rename to cpp/src/raft_runtime/cluster/kmeans_fit_float.cu
diff --git a/cpp/src/cluster/kmeans_init_plus_plus_double.cu b/cpp/src/raft_runtime/cluster/kmeans_init_plus_plus_double.cu
similarity index 100%
rename from cpp/src/cluster/kmeans_init_plus_plus_double.cu
rename to cpp/src/raft_runtime/cluster/kmeans_init_plus_plus_double.cu
diff --git a/cpp/src/cluster/kmeans_init_plus_plus_float.cu b/cpp/src/raft_runtime/cluster/kmeans_init_plus_plus_float.cu
similarity index 100%
rename from cpp/src/cluster/kmeans_init_plus_plus_float.cu
rename to cpp/src/raft_runtime/cluster/kmeans_init_plus_plus_float.cu
diff --git a/cpp/src/cluster/update_centroids.cuh b/cpp/src/raft_runtime/cluster/update_centroids.cuh
similarity index 100%
rename from cpp/src/cluster/update_centroids.cuh
rename to cpp/src/raft_runtime/cluster/update_centroids.cuh
diff --git a/cpp/src/cluster/update_centroids_double.cu b/cpp/src/raft_runtime/cluster/update_centroids_double.cu
similarity index 100%
rename from cpp/src/cluster/update_centroids_double.cu
rename to cpp/src/raft_runtime/cluster/update_centroids_double.cu
diff --git a/cpp/src/cluster/update_centroids_float.cu b/cpp/src/raft_runtime/cluster/update_centroids_float.cu
similarity index 100%
rename from cpp/src/cluster/update_centroids_float.cu
rename to cpp/src/raft_runtime/cluster/update_centroids_float.cu
diff --git a/cpp/src/distance/fused_l2_min_arg.cu b/cpp/src/raft_runtime/distance/fused_l2_min_arg.cu
similarity index 100%
rename from cpp/src/distance/fused_l2_min_arg.cu
rename to cpp/src/raft_runtime/distance/fused_l2_min_arg.cu
diff --git a/cpp/src/distance/pairwise_distance.cu b/cpp/src/raft_runtime/distance/pairwise_distance.cu
similarity index 100%
rename from cpp/src/distance/pairwise_distance.cu
rename to cpp/src/raft_runtime/distance/pairwise_distance.cu
diff --git a/cpp/src/neighbors/brute_force_knn_int64_t_float.cu b/cpp/src/raft_runtime/neighbors/brute_force_knn_int64_t_float.cu
similarity index 100%
rename from cpp/src/neighbors/brute_force_knn_int64_t_float.cu
rename to cpp/src/raft_runtime/neighbors/brute_force_knn_int64_t_float.cu
diff --git a/cpp/src/neighbors/ivf_flat_build.cu b/cpp/src/raft_runtime/neighbors/ivf_flat_build.cu
similarity index 100%
rename from cpp/src/neighbors/ivf_flat_build.cu
rename to cpp/src/raft_runtime/neighbors/ivf_flat_build.cu
diff --git a/cpp/src/neighbors/ivf_flat_search.cu b/cpp/src/raft_runtime/neighbors/ivf_flat_search.cu
similarity index 100%
rename from cpp/src/neighbors/ivf_flat_search.cu
rename to cpp/src/raft_runtime/neighbors/ivf_flat_search.cu
diff --git a/cpp/src/neighbors/ivfpq_build.cu b/cpp/src/raft_runtime/neighbors/ivfpq_build.cu
similarity index 100%
rename from cpp/src/neighbors/ivfpq_build.cu
rename to cpp/src/raft_runtime/neighbors/ivfpq_build.cu
diff --git a/cpp/src/neighbors/ivfpq_deserialize.cu b/cpp/src/raft_runtime/neighbors/ivfpq_deserialize.cu
similarity index 96%
rename from cpp/src/neighbors/ivfpq_deserialize.cu
rename to cpp/src/raft_runtime/neighbors/ivfpq_deserialize.cu
index bb6ac13966..45b731fdcf 100644
--- a/cpp/src/neighbors/ivfpq_deserialize.cu
+++ b/cpp/src/raft_runtime/neighbors/ivfpq_deserialize.cu
@@ -14,7 +14,9 @@
  * limitations under the License.
  */
 
+#include <raft/neighbors/ivf_pq.cuh>
 #include <raft/neighbors/ivf_pq_serialize.cuh>
+
 #include <raft_runtime/neighbors/ivf_pq.hpp>
 
 namespace raft::runtime::neighbors::ivf_pq {
diff --git a/cpp/src/raft_runtime/neighbors/ivfpq_search_float_int64_t.cu b/cpp/src/raft_runtime/neighbors/ivfpq_search_float_int64_t.cu
new file mode 100644
index 0000000000..91093d3a39
--- /dev/null
+++ b/cpp/src/raft_runtime/neighbors/ivfpq_search_float_int64_t.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq.cuh>
+#include <raft/neighbors/specializations.cuh>
+
+#include <raft_runtime/neighbors/ivf_pq.hpp>
+
+namespace raft::runtime::neighbors::ivf_pq {
+
+#define RAFT_SEARCH_INST(T, IdxT)                                                                 \
+  void search(raft::device_resources const& handle,                                               \
+              const raft::neighbors::ivf_pq::search_params& params,                               \
+              const raft::neighbors::ivf_pq::index<IdxT>& idx,                                    \
+              raft::device_matrix_view<const T, IdxT, row_major> queries,                         \
+              raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,                          \
+              raft::device_matrix_view<float, IdxT, row_major> distances)                         \
+  {                                                                                               \
+    raft::neighbors::ivf_pq::search<T, IdxT>(handle, params, idx, queries, neighbors, distances); \
+  }
+
+RAFT_SEARCH_INST(float, int64_t);  // emits search() for float queries, int64_t indices
+
+#undef RAFT_SEARCH_INST
+
+}  // namespace raft::runtime::neighbors::ivf_pq
diff --git a/cpp/src/raft_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu b/cpp/src/raft_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu
new file mode 100644
index 0000000000..e1552c0b27
--- /dev/null
+++ b/cpp/src/raft_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq.cuh>
+#include <raft/neighbors/specializations.cuh>
+
+#include <raft_runtime/neighbors/ivf_pq.hpp>
+
+namespace raft::runtime::neighbors::ivf_pq {
+
+#define RAFT_SEARCH_INST(T, IdxT)                                                                 \
+  void search(raft::device_resources const& handle,                                               \
+              const raft::neighbors::ivf_pq::search_params& params,                               \
+              const raft::neighbors::ivf_pq::index<IdxT>& idx,                                    \
+              raft::device_matrix_view<const T, IdxT, row_major> queries,                         \
+              raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,                          \
+              raft::device_matrix_view<float, IdxT, row_major> distances)                         \
+  {                                                                                               \
+    raft::neighbors::ivf_pq::search<T, IdxT>(handle, params, idx, queries, neighbors, distances); \
+  }
+
+RAFT_SEARCH_INST(int8_t, int64_t);  // emits search() for int8_t queries, int64_t indices
+
+#undef RAFT_SEARCH_INST
+
+}  // namespace raft::runtime::neighbors::ivf_pq
diff --git a/cpp/src/raft_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu b/cpp/src/raft_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu
new file mode 100644
index 0000000000..85195a7551
--- /dev/null
+++ b/cpp/src/raft_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_pq.cuh>
+#include <raft/neighbors/specializations.cuh>
+
+#include <raft_runtime/neighbors/ivf_pq.hpp>
+
+namespace raft::runtime::neighbors::ivf_pq {
+
+#define RAFT_SEARCH_INST(T, IdxT)                                                                 \
+  void search(raft::device_resources const& handle,                                               \
+              const raft::neighbors::ivf_pq::search_params& params,                               \
+              const raft::neighbors::ivf_pq::index<IdxT>& idx,                                    \
+              raft::device_matrix_view<const T, IdxT, row_major> queries,                         \
+              raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,                          \
+              raft::device_matrix_view<float, IdxT, row_major> distances)                         \
+  {                                                                                               \
+    raft::neighbors::ivf_pq::search<T, IdxT>(handle, params, idx, queries, neighbors, distances); \
+  }
+
+RAFT_SEARCH_INST(uint8_t, int64_t);  // emits search() for uint8_t queries, int64_t indices
+
+#undef RAFT_SEARCH_INST
+
+}  // namespace raft::runtime::neighbors::ivf_pq
diff --git a/cpp/src/neighbors/ivfpq_serialize.cu b/cpp/src/raft_runtime/neighbors/ivfpq_serialize.cu
similarity index 96%
rename from cpp/src/neighbors/ivfpq_serialize.cu
rename to cpp/src/raft_runtime/neighbors/ivfpq_serialize.cu
index 0ba1b929b7..21bd221c45 100644
--- a/cpp/src/neighbors/ivfpq_serialize.cu
+++ b/cpp/src/raft_runtime/neighbors/ivfpq_serialize.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <raft/neighbors/ivf_pq.cuh>
 #include <raft/neighbors/ivf_pq_serialize.cuh>
 
 #include <raft_runtime/neighbors/ivf_pq.hpp>
diff --git a/cpp/src/neighbors/refine_d_int64_t_float.cu b/cpp/src/raft_runtime/neighbors/refine_d_int64_t_float.cu
similarity index 100%
rename from cpp/src/neighbors/refine_d_int64_t_float.cu
rename to cpp/src/raft_runtime/neighbors/refine_d_int64_t_float.cu
diff --git a/cpp/src/neighbors/refine_d_int64_t_int8_t.cu b/cpp/src/raft_runtime/neighbors/refine_d_int64_t_int8_t.cu
similarity index 100%
rename from cpp/src/neighbors/refine_d_int64_t_int8_t.cu
rename to cpp/src/raft_runtime/neighbors/refine_d_int64_t_int8_t.cu
diff --git a/cpp/src/neighbors/refine_d_int64_t_uint8_t.cu b/cpp/src/raft_runtime/neighbors/refine_d_int64_t_uint8_t.cu
similarity index 100%
rename from cpp/src/neighbors/refine_d_int64_t_uint8_t.cu
rename to cpp/src/raft_runtime/neighbors/refine_d_int64_t_uint8_t.cu
diff --git a/cpp/src/neighbors/refine_h_int64_t_float.cu b/cpp/src/raft_runtime/neighbors/refine_h_int64_t_float.cu
similarity index 100%
rename from cpp/src/neighbors/refine_h_int64_t_float.cu
rename to cpp/src/raft_runtime/neighbors/refine_h_int64_t_float.cu
diff --git a/cpp/src/neighbors/refine_h_int64_t_int8_t.cu b/cpp/src/raft_runtime/neighbors/refine_h_int64_t_int8_t.cu
similarity index 100%
rename from cpp/src/neighbors/refine_h_int64_t_int8_t.cu
rename to cpp/src/raft_runtime/neighbors/refine_h_int64_t_int8_t.cu
diff --git a/cpp/src/neighbors/refine_h_int64_t_uint8_t.cu b/cpp/src/raft_runtime/neighbors/refine_h_int64_t_uint8_t.cu
similarity index 100%
rename from cpp/src/neighbors/refine_h_int64_t_uint8_t.cu
rename to cpp/src/raft_runtime/neighbors/refine_h_int64_t_uint8_t.cu
diff --git a/cpp/src/random/common.cuh b/cpp/src/raft_runtime/random/common.cuh
similarity index 100%
rename from cpp/src/random/common.cuh
rename to cpp/src/raft_runtime/random/common.cuh
diff --git a/cpp/src/random/rmat_rectangular_generator_int64_double.cu b/cpp/src/raft_runtime/random/rmat_rectangular_generator_int64_double.cu
similarity index 100%
rename from cpp/src/random/rmat_rectangular_generator_int64_double.cu
rename to cpp/src/raft_runtime/random/rmat_rectangular_generator_int64_double.cu
diff --git a/cpp/src/random/rmat_rectangular_generator_int64_float.cu b/cpp/src/raft_runtime/random/rmat_rectangular_generator_int64_float.cu
similarity index 100%
rename from cpp/src/random/rmat_rectangular_generator_int64_float.cu
rename to cpp/src/raft_runtime/random/rmat_rectangular_generator_int64_float.cu
diff --git a/cpp/src/random/rmat_rectangular_generator_int_double.cu b/cpp/src/raft_runtime/random/rmat_rectangular_generator_int_double.cu
similarity index 100%
rename from cpp/src/random/rmat_rectangular_generator_int_double.cu
rename to cpp/src/raft_runtime/random/rmat_rectangular_generator_int_double.cu
diff --git a/cpp/src/random/rmat_rectangular_generator_int_float.cu b/cpp/src/raft_runtime/random/rmat_rectangular_generator_int_float.cu
similarity index 100%
rename from cpp/src/random/rmat_rectangular_generator_int_float.cu
rename to cpp/src/raft_runtime/random/rmat_rectangular_generator_int_float.cu

From cb80db69b542f33dc9c4de3b12fbcad8fadb3def Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 13 Apr 2023 16:59:57 +0200
Subject: [PATCH 30/89] Split ivf_flat

---
 cpp/CMakeLists.txt                            |  15 +-
 cpp/include/raft/neighbors/ivf_flat-ext.cuh   | 501 ++++++++++++++++++
 cpp/include/raft/neighbors/ivf_flat-inl.cuh   |  84 ++-
 cpp/include/raft/neighbors/ivf_flat.cuh       |  25 +
 .../raft/neighbors/specializations.cuh        |   1 -
 .../neighbors/specializations/ivf_flat.cuh    |  54 --
 cpp/src/neighbors/ivf_flat_00_generate.py     | 127 +++++
 .../neighbors/ivf_flat_build_float_int64_t.cu |  41 ++
 .../ivf_flat_build_int8_t_int64_t.cu          |  41 ++
 .../ivf_flat_build_uint8_t_int64_t.cu         |  41 ++
 .../ivf_flat_extend_float_int64_t.cu          |  49 ++
 .../ivf_flat_extend_int8_t_int64_t.cu         |  49 ++
 .../ivf_flat_extend_uint8_t_int64_t.cu        |  49 ++
 .../ivf_flat_search_float_int64_t.cu          |  40 ++
 .../ivf_flat_search_int8_t_int64_t.cu         |  40 ++
 .../ivf_flat_search_uint8_t_int64_t.cu        |  40 ++
 .../ivfflat_build_float_int64_t.cu            |  31 --
 .../ivfflat_build_int8_t_int64_t.cu           |  31 --
 .../ivfflat_build_uint8_t_int64_t.cu          |  31 --
 .../ivfflat_extend_float_int64_t.cu           |  37 --
 .../ivfflat_extend_int8_t_int64_t.cu          |  37 --
 .../ivfflat_extend_uint8_t_int64_t.cu         |  37 --
 .../ivfflat_search_float_int64_t.cu           |  58 --
 .../ivfflat_search_int8_t_int64_t.cu          |  49 --
 .../ivfflat_search_uint8_t_int64_t.cu         |  49 --
 .../ivfpq_build_float_int64_t.cu              |  32 --
 .../ivfpq_build_int8_t_int64_t.cu             |  32 --
 .../ivfpq_build_uint8_t_int64_t.cu            |  32 --
 .../ivfpq_extend_float_int64_t.cu             |  39 --
 .../ivfpq_extend_int8_t_int64_t.cu            |  39 --
 .../ivfpq_extend_uint8_t_int64_t.cu           |  39 --
 .../ivfpq_search_float_int64_t.cu             |  34 --
 .../ivfpq_search_int8_t_int64_t.cu            |  34 --
 .../ivfpq_search_uint8_t_int64_t.cu           |  34 --
 .../raft_runtime/neighbors/ivf_flat_build.cu  |   2 +-
 .../raft_runtime/neighbors/ivf_flat_search.cu |   2 +-
 36 files changed, 1095 insertions(+), 781 deletions(-)
 create mode 100644 cpp/include/raft/neighbors/ivf_flat-ext.cuh
 delete mode 100644 cpp/include/raft/neighbors/specializations/ivf_flat.cuh
 create mode 100644 cpp/src/neighbors/ivf_flat_00_generate.py
 create mode 100644 cpp/src/neighbors/ivf_flat_build_float_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivf_flat_build_int8_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivf_flat_build_uint8_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivf_flat_extend_float_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivf_flat_extend_int8_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivf_flat_search_float_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu
 create mode 100644 cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfflat_build_float_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfflat_build_int8_t_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfflat_build_uint8_t_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfflat_extend_float_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfflat_search_float_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfpq_build_float_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfpq_extend_float_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfpq_search_float_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 378c1004cd..cf0937beed 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -334,6 +334,15 @@ if(RAFT_COMPILE_LIBRARY)
     src/matrix/detail/select_k_float_uint32_t.cu
     src/matrix/detail/select_k_half_int64_t.cu
     src/matrix/detail/select_k_half_uint32_t.cu
+    src/neighbors/ivf_flat_build_float_int64_t.cu
+    src/neighbors/ivf_flat_extend_float_int64_t.cu
+    src/neighbors/ivf_flat_search_float_int64_t.cu
+    src/neighbors/ivf_flat_build_int8_t_int64_t.cu
+    src/neighbors/ivf_flat_extend_int8_t_int64_t.cu
+    src/neighbors/ivf_flat_search_int8_t_int64_t.cu
+    src/neighbors/ivf_flat_build_uint8_t_int64_t.cu
+    src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu
+    src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
     src/neighbors/ivfpq_search_float_int64_t.cu
     src/neighbors/ivfpq_search_int8_t_int64_t.cu
     src/neighbors/ivfpq_search_uint8_t_int64_t.cu
@@ -348,12 +357,6 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/specializations/ball_cover_all_knn_query.cu
     src/neighbors/specializations/ball_cover_build_index.cu
     src/neighbors/specializations/ball_cover_knn_query.cu
-    src/neighbors/specializations/ivfflat_build_float_int64_t.cu
-    src/neighbors/specializations/ivfflat_build_int8_t_int64_t.cu
-    src/neighbors/specializations/ivfflat_build_uint8_t_int64_t.cu
-    src/neighbors/specializations/ivfflat_extend_float_int64_t.cu
-    src/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu
-    src/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu
     src/neighbors/ivfpq_build_float_int64_t.cu
     src/neighbors/ivfpq_build_int8_t_int64_t.cu
     src/neighbors/ivfpq_build_uint8_t_int64_t.cu
diff --git a/cpp/include/raft/neighbors/ivf_flat-ext.cuh b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
new file mode 100644
index 0000000000..675b2fd97b
--- /dev/null
+++ b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/neighbors/detail/ivf_flat_build.cuh>
+#include <raft/neighbors/detail/ivf_flat_search.cuh>
+#include <raft/neighbors/ivf_flat_serialize.cuh>
+#include <raft/neighbors/ivf_flat_types.hpp>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE
+
+namespace raft::neighbors::ivf_flat {
+
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * NB: Currently, the following distance metrics are supported:
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   // use default index parameters
+ *   ivf_flat::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_flat::build(handle, index_params, dataset, N, D);
+ *   // use default search parameters
+ *   ivf_flat::search_params search_params;
+ *   // search K nearest neighbours for each of the N queries
+ *   ivf_flat::search(handle, search_params, index, queries, N, K, out_inds, out_dists);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] handle
+ * @param[in] params configure the index building
+ * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
+ * @param[in] n_rows the number of samples
+ * @param[in] dim the dimensionality of the data
+ *
+ * @return the constructed ivf-flat index
+ */
+template <typename T, typename IdxT>
+auto build(raft::device_resources const& handle,
+           const index_params& params,
+           const T* dataset,
+           IdxT n_rows,
+           uint32_t dim) -> index<T, IdxT> RAFT_EXPLICIT;
+
+/**
+ * @defgroup ivf_flat IVF Flat Algorithm
+ * @{
+ */
+
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * NB: Currently, the following distance metrics are supported:
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   // use default index parameters
+ *   ivf_flat::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_flat::build(handle, index_params, dataset);
+ *   // use default search parameters
+ *   ivf_flat::search_params search_params;
+ *   // search K nearest neighbours for each of the N queries
+ *   ivf_flat::search(handle, search_params, index, queries, out_inds, out_dists);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ * @tparam int_t precision / type of integral arguments
+ *
+ * @param[in] handle
+ * @param[in] params configure the index building
+ * @param[in] dataset raft::device_matrix_view to a row-major matrix [n_rows, dim]
+ *
+ * @return the constructed ivf-flat index
+ */
+template <typename T, typename IdxT>
+auto build(raft::device_resources const& handle,
+           const index_params& params,
+           raft::device_matrix_view<const T, IdxT, row_major> dataset)
+  -> index<T, IdxT> RAFT_EXPLICIT;
+
+/**
+ * @brief Build the index from the dataset for efficient search.
+ *
+ * NB: Currently, the following distance metrics are supported:
+ * - L2Expanded
+ * - L2Unexpanded
+ * - InnerProduct
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   // use default index parameters
+ *   ivf_flat::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   ivf_flat::index<decltype(dataset::value_type), decltype(dataset::index_type)> index;
+ *   ivf_flat::build(handle, index_params, dataset, index);
+ *   // use default search parameters
+ *   ivf_flat::search_params search_params;
+ *   // search K nearest neighbours for each of the N queries
+ *   ivf_flat::search(handle, search_params, index, queries, out_inds, out_dists);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ * @tparam int_t precision / type of integral arguments
+ * @tparam matrix_IdxT matrix indexing type
+ *
+ * @param[in] handle
+ * @param[in] params configure the index building
+ * @param[in] dataset raft::device_matrix_view to a row-major matrix [n_rows, dim]
+ * @param[out] idx reference to ivf_flat::index
+ *
+ */
+template <typename T, typename IdxT>
+void build(raft::device_resources const& handle,
+           const index_params& params,
+           raft::device_matrix_view<const T, IdxT, row_major> dataset,
+           raft::neighbors::ivf_flat::index<T, IdxT>& idx) RAFT_EXPLICIT;
+
+/** @} */
+
+/**
+ * @brief Build a new index containing the data of the original plus new extra vectors.
+ *
+ * Implementation note:
+ *    The new data is clustered according to existing kmeans clusters, then the cluster
+ *    centers are adjusted to match the newly labeled data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   ivf_flat::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_flat::build(handle, index_params, dataset, N, D);
+ *   // fill the index with the data
+ *   auto index = ivf_flat::extend(handle, index_empty, dataset, nullptr, N);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] handle
+ * @param[in] orig_index original index
+ * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
+ * @param[in] new_indices a device pointer to a vector of indices [n_rows].
+ *    If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] n_rows number of rows in `new_vectors`
+ *
+ * @return the constructed extended ivf-flat index
+ */
+template <typename T, typename IdxT>
+auto extend(raft::device_resources const& handle,
+            const index<T, IdxT>& orig_index,
+            const T* new_vectors,
+            const IdxT* new_indices,
+            IdxT n_rows) -> index<T, IdxT> RAFT_EXPLICIT;
+
+/**
+ * @ingroup ivf_flat
+ * @{
+ */
+
+/**
+ * @brief Build a new index containing the data of the original plus new extra vectors.
+ *
+ * Implementation note:
+ *    The new data is clustered according to existing kmeans clusters, then the cluster
+ *    centers are adjusted to match the newly labeled data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   ivf_flat::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_flat::build(handle, index_params, dataset);
+ *   // fill the index with the data
+ *   auto index = ivf_flat::extend<T, IdxT>(handle, dataset, std::nullopt, index_empty);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] handle
+ * @param[in] new_vectors raft::device_matrix_view to a row-major matrix [n_rows, index.dim()]
+ * @param[in] new_indices optional raft::device_vector_view to a vector of indices [n_rows].
+ *    If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] orig_index original index
+ *
+ * @return the constructed extended ivf-flat index
+ */
+template <typename T, typename IdxT>
+auto extend(raft::device_resources const& handle,
+            raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
+            std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices,
+            const index<T, IdxT>& orig_index) -> index<T, IdxT> RAFT_EXPLICIT;
+
+/** @} */
+
+/**
+ * @brief Extend the index in-place with the new data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   ivf_flat::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_flat::build(handle, index_params, dataset, N, D);
+ *   // fill the index with the data
+ *   ivf_flat::extend(handle, index_empty, dataset, nullptr, N);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param handle
+ * @param[inout] index
+ * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
+ * @param[in] new_indices a device pointer to a vector of indices [n_rows].
+ *    If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[in] n_rows the number of samples
+ */
+template <typename T, typename IdxT>
+void extend(raft::device_resources const& handle,
+            index<T, IdxT>* index,
+            const T* new_vectors,
+            const IdxT* new_indices,
+            IdxT n_rows) RAFT_EXPLICIT;
+
+/**
+ * @ingroup ivf_flat
+ * @{
+ */
+
+/**
+ * @brief Extend the index in-place with the new data.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   ivf_flat::index_params index_params;
+ *   index_params.add_data_on_build = false;      // don't populate index on build
+ *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
+ *   // train the index from a [N, D] dataset
+ *   auto index_empty = ivf_flat::build(handle, index_params, dataset);
+ *   // fill the index with the data
+ *   std::optional<raft::device_vector_view<const IdxT, IdxT>> no_opt = std::nullopt;
+ *   ivf_flat::extend(handle, dataset, no_opt, &index_empty);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] handle
+ * @param[in] new_vectors raft::device_matrix_view to a row-major matrix [n_rows, index.dim()]
+ * @param[in] new_indices optional raft::device_vector_view to a vector of indices [n_rows].
+ *    If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt`
+ *    here to imply a continuous range `[0...n_rows)`.
+ * @param[inout] index pointer to index, to be overwritten in-place
+ */
+template <typename T, typename IdxT>
+void extend(raft::device_resources const& handle,
+            raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
+            std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices,
+            index<T, IdxT>* index) RAFT_EXPLICIT;
+
+/** @} */
+
+/**
+ * @brief Search ANN using the constructed index.
+ *
+ * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example.
+ *
+ * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
+ * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
+ * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
+ * eliminate entirely allocations happening within `search`:
+ * @code{.cpp}
+ *   ...
+ *   // Create a pooling memory resource with a pre-defined initial size.
+ *   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr(
+ *     rmm::mr::get_current_device_resource(), 1024 * 1024);
+ *   // use default search parameters
+ *   ivf_flat::search_params search_params;
+ *   // Use the same allocator across multiple searches to reduce the number of
+ *   // cuda memory allocations
+ *   ivf_flat::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr);
+ *   ivf_flat::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr);
+ *   ivf_flat::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr);
+ *   ...
+ * @endcode
+ * The exact size of the temporary buffer depends on multiple factors and is an implementation
+ * detail. However, you can safely specify a small initial size for the memory pool, so that only a
+ * few allocations happen to grow it during the first invocations of the `search`.
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ *
+ * @param[in] handle
+ * @param[in] params configure the search
+ * @param[in] index ivf-flat constructed index
+ * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()]
+ * @param[in] n_queries the batch size
+ * @param[in] k the number of neighbors to find for each query.
+ * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
+ * [n_queries, k]
+ * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
+ * @param[in] mr an optional memory resource to use across the searches (you can provide a large
+ * enough memory pool here to avoid memory allocations within search).
+ */
+template <typename T, typename IdxT>
+void search(raft::device_resources const& handle,
+            const search_params& params,
+            const index<T, IdxT>& index,
+            const T* queries,
+            uint32_t n_queries,
+            uint32_t k,
+            IdxT* neighbors,
+            float* distances,
+            rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT;
+
+/**
+ * @ingroup ivf_flat
+ * @{
+ */
+
+/**
+ * @brief Search ANN using the constructed index.
+ *
+ * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example.
+ *
+ * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
+ * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
+ * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
+ * eliminate entirely allocations happening within `search`:
+ * @code{.cpp}
+ *   ...
+ *   // use default search parameters
+ *   ivf_flat::search_params search_params;
+ *   // Use the same allocator across multiple searches to reduce the number of
+ *   // cuda memory allocations
+ *   ivf_flat::search(handle, search_params, index, queries1, out_inds1, out_dists1);
+ *   ivf_flat::search(handle, search_params, index, queries2, out_inds2, out_dists2);
+ *   ivf_flat::search(handle, search_params, index, queries3, out_inds3, out_dists3);
+ *   ...
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
+ * @tparam int_t precision / type of integral arguments
+ *
+ * @param[in] handle
+ * @param[in] params configure the search
+ * @param[in] index ivf-flat constructed index
+ * @param[in] queries raft::device_matrix_view to a row-major matrix [n_queries, index->dim()]
+ * @param[out] neighbors raft::device_matrix_view to the indices of the neighbors in the source
+ * dataset [n_queries, k]
+ * @param[out] distances raft::device_matrix_view to the distances to the neighbors [n_queries, k]
+ */
+template <typename T, typename IdxT>
+void search(raft::device_resources const& handle,
+            const search_params& params,
+            const index<T, IdxT>& index,
+            raft::device_matrix_view<const T, IdxT, row_major> queries,
+            raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,
+            raft::device_matrix_view<float, IdxT, row_major> distances) RAFT_EXPLICIT;
+
+/** @} */
+
+}  // namespace raft::neighbors::ivf_flat
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+
+#define instantiate_raft_neighbors_ivf_flat_build(T, IdxT)        \
+  extern template auto raft::neighbors::ivf_flat::build<T, IdxT>( \
+    raft::device_resources const& handle,                         \
+    const raft::neighbors::ivf_flat::index_params& params,        \
+    const T* dataset,                                             \
+    IdxT n_rows,                                                  \
+    uint32_t dim)                                                 \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                  \
+                                                                  \
+  extern template auto raft::neighbors::ivf_flat::build<T, IdxT>( \
+    raft::device_resources const& handle,                         \
+    const raft::neighbors::ivf_flat::index_params& params,        \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset)   \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                  \
+                                                                  \
+  extern template void raft::neighbors::ivf_flat::build<T, IdxT>( \
+    raft::device_resources const& handle,                         \
+    const raft::neighbors::ivf_flat::index_params& params,        \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset,   \
+    raft::neighbors::ivf_flat::index<T, IdxT>& idx);
+
+instantiate_raft_neighbors_ivf_flat_build(float, int64_t);
+instantiate_raft_neighbors_ivf_flat_build(int8_t, int64_t);
+instantiate_raft_neighbors_ivf_flat_build(uint8_t, int64_t);
+#undef instantiate_raft_neighbors_ivf_flat_build
+
+#define instantiate_raft_neighbors_ivf_flat_extend(T, IdxT)                \
+  extern template auto raft::neighbors::ivf_flat::extend<T, IdxT>(         \
+    raft::device_resources const& handle,                                  \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& orig_index,           \
+    const T* new_vectors,                                                  \
+    const IdxT* new_indices,                                               \
+    IdxT n_rows)                                                           \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                           \
+                                                                           \
+  extern template auto raft::neighbors::ivf_flat::extend<T, IdxT>(         \
+    raft::device_resources const& handle,                                  \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& orig_index)           \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                           \
+                                                                           \
+  extern template void raft::neighbors::ivf_flat::extend<T, IdxT>(         \
+    raft::device_resources const& handle,                                  \
+    raft::neighbors::ivf_flat::index<T, IdxT>* index,                      \
+    const T* new_vectors,                                                  \
+    const IdxT* new_indices,                                               \
+    IdxT n_rows);                                                          \
+                                                                           \
+  extern template void raft::neighbors::ivf_flat::extend<T, IdxT>(         \
+    raft::device_resources const& handle,                                  \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
+    raft::neighbors::ivf_flat::index<T, IdxT>* index);
+
+instantiate_raft_neighbors_ivf_flat_extend(float, int64_t);
+instantiate_raft_neighbors_ivf_flat_extend(int8_t, int64_t);
+instantiate_raft_neighbors_ivf_flat_extend(uint8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_extend
+
+#define instantiate_raft_neighbors_ivf_flat_search(T, IdxT)        \
+  extern template void raft::neighbors::ivf_flat::search<T, IdxT>( \
+    raft::device_resources const& handle,                          \
+    const raft::neighbors::ivf_flat::search_params& params,        \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,        \
+    const T* queries,                                              \
+    uint32_t n_queries,                                            \
+    uint32_t k,                                                    \
+    IdxT* neighbors,                                               \
+    float* distances,                                              \
+    rmm::mr::device_memory_resource* mr);                          \
+                                                                   \
+  extern template void raft::neighbors::ivf_flat::search<T, IdxT>( \
+    raft::device_resources const& handle,                          \
+    const raft::neighbors::ivf_flat::search_params& params,        \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,        \
+    raft::device_matrix_view<const T, IdxT, row_major> queries,    \
+    raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,     \
+    raft::device_matrix_view<float, IdxT, row_major> distances);
+
+instantiate_raft_neighbors_ivf_flat_search(float, int64_t);
+instantiate_raft_neighbors_ivf_flat_search(int8_t, int64_t);
+instantiate_raft_neighbors_ivf_flat_search(uint8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_search
diff --git a/cpp/include/raft/neighbors/ivf_flat-inl.cuh b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
index c573676504..1bfe206608 100644
--- a/cpp/include/raft/neighbors/ivf_flat-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
@@ -97,8 +97,8 @@ auto build(raft::device_resources const& handle,
  *   ivf_flat::search(handle, index, queries, out_inds, out_dists, search_params, k);
  * @endcode
  *
- * @tparam value_t data element type
- * @tparam idx_t type of the indices in the source dataset
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
  * @tparam int_t precision / type of integral arguments
  *
  * @param[in] handle
@@ -107,17 +107,16 @@ auto build(raft::device_resources const& handle,
  *
  * @return the constructed ivf-flat index
  */
-template <typename value_t, typename idx_t>
+template <typename T, typename IdxT>
 auto build(raft::device_resources const& handle,
            const index_params& params,
-           raft::device_matrix_view<const value_t, idx_t, row_major> dataset)
-  -> index<value_t, idx_t>
+           raft::device_matrix_view<const T, IdxT, row_major> dataset) -> index<T, IdxT>
 {
   return raft::neighbors::ivf_flat::detail::build(handle,
                                                   params,
                                                   dataset.data_handle(),
-                                                  static_cast<idx_t>(dataset.extent(0)),
-                                                  static_cast<idx_t>(dataset.extent(1)));
+                                                  static_cast<IdxT>(dataset.extent(0)),
+                                                  static_cast<IdxT>(dataset.extent(1)));
 }
 
 /**
@@ -134,7 +133,7 @@ auto build(raft::device_resources const& handle,
  *   // use default index parameters
  *   ivf_flat::index_params index_params;
  *   // create and fill the index from a [N, D] dataset
- *   ivf_flat::index<decltype(dataset::value_type), decltype(dataset::index_type)> index;
+ *   ivf_flat::index<decltype(dataset::value_type), decltype(dataset::index_type)> index;
  *   ivf_flat::build(handle, dataset, index_params, index);
  *   // use default search parameters
  *   ivf_flat::search_params search_params;
@@ -142,10 +141,10 @@ auto build(raft::device_resources const& handle,
  *   ivf_flat::search(handle, index, queries, out_inds, out_dists, search_params, k);
  * @endcode
  *
- * @tparam value_t data element type
- * @tparam idx_t type of the indices in the source dataset
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
  * @tparam int_t precision / type of integral arguments
- * @tparam matrix_idx_t matrix indexing type
+ * @tparam matrix_IdxT matrix indexing type
  *
  * @param[in] handle
  * @param[in] params configure the index building
@@ -153,17 +152,17 @@ auto build(raft::device_resources const& handle,
  * @param[out] idx reference to ivf_flat::index
  *
  */
-template <typename value_t, typename idx_t>
+template <typename T, typename IdxT>
 void build(raft::device_resources const& handle,
            const index_params& params,
-           raft::device_matrix_view<const value_t, idx_t, row_major> dataset,
-           raft::neighbors::ivf_flat::index<value_t, idx_t>& idx)
+           raft::device_matrix_view<const T, IdxT, row_major> dataset,
+           raft::neighbors::ivf_flat::index<T, IdxT>& idx)
 {
   idx = raft::neighbors::ivf_flat::detail::build(handle,
                                                  params,
                                                  dataset.data_handle(),
-                                                 static_cast<idx_t>(dataset.extent(0)),
-                                                 static_cast<idx_t>(dataset.extent(1)));
+                                                 static_cast<IdxT>(dataset.extent(0)),
+                                                 static_cast<IdxT>(dataset.extent(1)));
 }
 
 /** @} */
@@ -235,8 +234,8 @@ auto extend(raft::device_resources const& handle,
  *   auto index = ivf_flat::extend(handle, index_empty, dataset);
  * @endcode
  *
- * @tparam value_t data element type
- * @tparam idx_t type of the indices in the source dataset
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
  *
  * @param[in] handle
  * @param[in] new_vectors raft::device_matrix_view to a row-major matrix [n_rows, index.dim()]
@@ -247,18 +246,17 @@ auto extend(raft::device_resources const& handle,
  *
  * @return the constructed extended ivf-flat index
  */
-template <typename value_t, typename idx_t>
+template <typename T, typename IdxT>
 auto extend(raft::device_resources const& handle,
-            raft::device_matrix_view<const value_t, idx_t, row_major> new_vectors,
-            std::optional<raft::device_vector_view<const idx_t, idx_t>> new_indices,
-            const index<value_t, idx_t>& orig_index) -> index<value_t, idx_t>
+            raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
+            std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices,
+            const index<T, IdxT>& orig_index) -> index<T, IdxT>
 {
-  return extend<value_t, idx_t>(
-    handle,
-    orig_index,
-    new_vectors.data_handle(),
-    new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
-    new_vectors.extent(0));
+  return extend<T, IdxT>(handle,
+                         orig_index,
+                         new_vectors.data_handle(),
+                         new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
+                         new_vectors.extent(0));
 }
 
 /** @} */
@@ -316,12 +314,12 @@ void extend(raft::device_resources const& handle,
  *   // train the index from a [N, D] dataset
  *   auto index_empty = ivf_flat::build(handle, dataset, index_params, dataset);
  *   // fill the index with the data
- *   std::optional<raft::device_vector_view<const idx_t, idx_t>> no_op = std::nullopt;
+ *   std::optional<raft::device_vector_view<const IdxT, IdxT>> no_opt = std::nullopt;
  *   ivf_flat::extend(handle, dataset, no_opt, &index_empty);
  * @endcode
  *
- * @tparam value_t data element type
- * @tparam idx_t type of the indices in the source dataset
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
  *
  * @param[in] handle
  * @param[in] new_vectors raft::device_matrix_view to a row-major matrix [n_rows, index.dim()]
@@ -330,17 +328,17 @@ void extend(raft::device_resources const& handle,
  *    here to imply a continuous range `[0...n_rows)`.
  * @param[inout] index pointer to index, to be overwritten in-place
  */
-template <typename value_t, typename idx_t>
+template <typename T, typename IdxT>
 void extend(raft::device_resources const& handle,
-            raft::device_matrix_view<const value_t, idx_t, row_major> new_vectors,
-            std::optional<raft::device_vector_view<const idx_t, idx_t>> new_indices,
-            index<value_t, idx_t>* index)
+            raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
+            std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices,
+            index<T, IdxT>* index)
 {
   extend(handle,
          index,
          new_vectors.data_handle(),
          new_indices.has_value() ? new_indices.value().data_handle() : nullptr,
-         static_cast<idx_t>(new_vectors.extent(0)));
+         static_cast<IdxT>(new_vectors.extent(0)));
 }
 
 /** @} */
@@ -428,8 +426,8 @@ void search(raft::device_resources const& handle,
  *   ...
  * @endcode
  *
- * @tparam value_t data element type
- * @tparam idx_t type of the indices
+ * @tparam T data element type
+ * @tparam IdxT type of the indices
  * @tparam int_t precision / type of integral arguments
  *
  * @param[in] handle
@@ -440,13 +438,13 @@ void search(raft::device_resources const& handle,
  * [n_queries, k]
  * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
  */
-template <typename value_t, typename idx_t>
+template <typename T, typename IdxT>
 void search(raft::device_resources const& handle,
             const search_params& params,
-            const index<value_t, idx_t>& index,
-            raft::device_matrix_view<const value_t, idx_t, row_major> queries,
-            raft::device_matrix_view<idx_t, idx_t, row_major> neighbors,
-            raft::device_matrix_view<float, idx_t, row_major> distances)
+            const index<T, IdxT>& index,
+            raft::device_matrix_view<const T, IdxT, row_major> queries,
+            raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,
+            raft::device_matrix_view<float, IdxT, row_major> distances)
 {
   RAFT_EXPECTS(
     queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0),
diff --git a/cpp/include/raft/neighbors/ivf_flat.cuh b/cpp/include/raft/neighbors/ivf_flat.cuh
index e69de29bb2..8e3e2bb813 100644
--- a/cpp/include/raft/neighbors/ivf_flat.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat.cuh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "ivf_flat-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "ivf_flat-ext.cuh"
+#endif
diff --git a/cpp/include/raft/neighbors/specializations.cuh b/cpp/include/raft/neighbors/specializations.cuh
index a75aa22e57..53fd47b23c 100644
--- a/cpp/include/raft/neighbors/specializations.cuh
+++ b/cpp/include/raft/neighbors/specializations.cuh
@@ -20,7 +20,6 @@
 #include <raft/neighbors/specializations/brute_force.cuh>
 #include <raft/neighbors/specializations/fused_l2_knn.cuh>
 
-#include <raft/neighbors/specializations/ivf_flat.cuh>
 #include <raft/neighbors/specializations/refine.cuh>
 
 #include <raft/cluster/specializations.cuh>
diff --git a/cpp/include/raft/neighbors/specializations/ivf_flat.cuh b/cpp/include/raft/neighbors/specializations/ivf_flat.cuh
deleted file mode 100644
index 0f17bd8586..0000000000
--- a/cpp/include/raft/neighbors/specializations/ivf_flat.cuh
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/neighbors/ivf_flat.cuh>
-
-namespace raft::neighbors::ivf_flat {
-
-// greppable-id-specializations-ivf-flat-search: The ivfflat_interleaved_scan
-// function is used in both raft::neighbors::ivf_flat::search and
-// raft::neighbors::detail::refine_device. To prevent a duplicate instantiation
-// of this function (which defines ~270 kernels) in the refine specializations,
-// an extern template definition is provided here. Please check related function
-// calls after editing template definition below. Search for
-// `greppable-id-specializations-ivf-flat-search` to find them.
-#define RAFT_INST(T, IdxT)                                                               \
-  extern template auto build(raft::device_resources const& handle,                       \
-                             const index_params& params,                                 \
-                             raft::device_matrix_view<const T, IdxT, row_major> dataset) \
-    ->index<T, IdxT>;                                                                    \
-                                                                                         \
-  extern template auto extend(                                                           \
-    raft::device_resources const& handle,                                                \
-    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                      \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices,               \
-    const index<T, IdxT>& orig_index)                                                    \
-    ->index<T, IdxT>;                                                                    \
-                                                                                         \
-  extern template void extend(                                                           \
-    raft::device_resources const& handle,                                                \
-    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                      \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices,               \
-    raft::neighbors::ivf_flat::index<T, IdxT>* idx);
-
-RAFT_INST(float, int64_t);
-RAFT_INST(int8_t, int64_t);
-RAFT_INST(uint8_t, int64_t);
-
-#undef RAFT_INST
-}  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/src/neighbors/ivf_flat_00_generate.py b/cpp/src/neighbors/ivf_flat_00_generate.py
new file mode 100644
index 0000000000..a5bff90165
--- /dev/null
+++ b/cpp/src/neighbors/ivf_flat_00_generate.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+"""Generate the explicit-instantiation sources for raft::neighbors::ivf_flat.
+
+For each entry point (build, extend, search) and each (T, IdxT) pair listed in
+`types`, one file named ivf_flat_{entry point}_{types}.cu is written. Every file
+consists of the license header, the include of ivf_flat-inl.cuh, the definition
+of an instantiation macro, one invocation of that macro and its #undef.
+
+The files are written next to this script, and each file name is printed with a
+src/neighbors/ prefix.
+"""
+
+import os
+
+header = """/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_flat-inl.cuh>
+"""
+
+# Maps a file-name suffix to the (T, IdxT) pair instantiated in that file.
+types = dict(
+    float_int64_t=("float", "int64_t"),
+    int8_t_int64_t=("int8_t", "int64_t"),
+    uint8_t_int64_t=("uint8_t", "int64_t"),
+)
+
+build_macro = """
+#define instantiate_raft_neighbors_ivf_flat_build(T, IdxT)        \\
+  template auto raft::neighbors::ivf_flat::build<T, IdxT>( \\
+    raft::device_resources const& handle,                         \\
+    const raft::neighbors::ivf_flat::index_params& params,        \\
+    const T* dataset,                                             \\
+    IdxT n_rows,                                                  \\
+    uint32_t dim)                                                 \\
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                  \\
+                                                                  \\
+  template auto raft::neighbors::ivf_flat::build<T, IdxT>( \\
+    raft::device_resources const& handle,                         \\
+    const raft::neighbors::ivf_flat::index_params& params,        \\
+    raft::device_matrix_view<const T, IdxT, row_major> dataset)   \\
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                  \\
+                                                                  \\
+  template void raft::neighbors::ivf_flat::build<T, IdxT>( \\
+    raft::device_resources const& handle,                         \\
+    const raft::neighbors::ivf_flat::index_params& params,        \\
+    raft::device_matrix_view<const T, IdxT, row_major> dataset,   \\
+    raft::neighbors::ivf_flat::index<T, IdxT>& idx);
+"""
+
+extend_macro = """
+#define instantiate_raft_neighbors_ivf_flat_extend(T, IdxT)                \\
+  template auto raft::neighbors::ivf_flat::extend<T, IdxT>(         \\
+    raft::device_resources const& handle,                                  \\
+    const raft::neighbors::ivf_flat::index<T, IdxT>& orig_index,           \\
+    const T* new_vectors,                                                  \\
+    const IdxT* new_indices,                                               \\
+    IdxT n_rows)                                                           \\
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                           \\
+                                                                           \\
+  template auto raft::neighbors::ivf_flat::extend<T, IdxT>(         \\
+    raft::device_resources const& handle,                                  \\
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \\
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \\
+    const raft::neighbors::ivf_flat::index<T, IdxT>& orig_index)           \\
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                           \\
+                                                                           \\
+  template void raft::neighbors::ivf_flat::extend<T, IdxT>(         \\
+    raft::device_resources const& handle,                                  \\
+    raft::neighbors::ivf_flat::index<T, IdxT>* index,                      \\
+    const T* new_vectors,                                                  \\
+    const IdxT* new_indices,                                               \\
+    IdxT n_rows);                                                          \\
+                                                                           \\
+  template void raft::neighbors::ivf_flat::extend<T, IdxT>(         \\
+    raft::device_resources const& handle,                                  \\
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \\
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \\
+    raft::neighbors::ivf_flat::index<T, IdxT>* index);
+"""
+
+# The pointer overload lists `mr` without a default value: any default for it
+# belongs on the template's own declaration, not on an explicit instantiation.
+search_macro = """
+#define instantiate_raft_neighbors_ivf_flat_search(T, IdxT)        \\
+  template void raft::neighbors::ivf_flat::search<T, IdxT>( \\
+    raft::device_resources const& handle,                          \\
+    const raft::neighbors::ivf_flat::search_params& params,        \\
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,        \\
+    const T* queries,                                              \\
+    uint32_t n_queries,                                            \\
+    uint32_t k,                                                    \\
+    IdxT* neighbors,                                               \\
+    float* distances,                                              \\
+    rmm::mr::device_memory_resource* mr);                          \\
+                                                                   \\
+  template void raft::neighbors::ivf_flat::search<T, IdxT>( \\
+    raft::device_resources const& handle,                          \\
+    const raft::neighbors::ivf_flat::search_params& params,        \\
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,        \\
+    raft::device_matrix_view<const T, IdxT, row_major> queries,    \\
+    raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,     \\
+    raft::device_matrix_view<float, IdxT, row_major> distances);
+"""
+
+macros = dict(
+    build=dict(
+        definition=build_macro,
+        name="instantiate_raft_neighbors_ivf_flat_build"),
+    extend=dict(
+        definition=extend_macro,
+        name="instantiate_raft_neighbors_ivf_flat_extend"),
+    search=dict(
+        definition=search_macro,
+        name="instantiate_raft_neighbors_ivf_flat_search"),
+)
+
+# Resolve output paths against this script so the working directory is irrelevant.
+out_dir = os.path.dirname(os.path.abspath(__file__))
+
+for type_path, (T, IdxT) in types.items():
+    for macro_path, macro in macros.items():
+        path = f"ivf_flat_{macro_path}_{type_path}.cu"
+        with open(os.path.join(out_dir, path), "w") as f:
+            f.write(header)
+            f.write(macro["definition"])
+            f.write(f"{macro['name']}({T}, {IdxT});\n\n")
+            f.write(f"#undef {macro['name']}\n")
+
+        print(f"src/neighbors/{path}")
diff --git a/cpp/src/neighbors/ivf_flat_build_float_int64_t.cu b/cpp/src/neighbors/ivf_flat_build_float_int64_t.cu
new file mode 100644
index 0000000000..ad4912efa4
--- /dev/null
+++ b/cpp/src/neighbors/ivf_flat_build_float_int64_t.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_flat-inl.cuh>
+// Instantiates the three ivf_flat::build overloads for T = float, IdxT = int64_t.
+#define instantiate_raft_neighbors_ivf_flat_build(T, IdxT)      \
+  template auto raft::neighbors::ivf_flat::build<T, IdxT>(      \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::index_params& params,      \
+    const T* dataset,                                           \
+    IdxT n_rows,                                                \
+    uint32_t dim)                                               \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                \
+                                                                \
+  template auto raft::neighbors::ivf_flat::build<T, IdxT>(      \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::index_params& params,      \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset) \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                \
+                                                                \
+  template void raft::neighbors::ivf_flat::build<T, IdxT>(      \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::index_params& params,      \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset, \
+    raft::neighbors::ivf_flat::index<T, IdxT>& idx);
+instantiate_raft_neighbors_ivf_flat_build(float, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_build  // keep the macro local to this file
diff --git a/cpp/src/neighbors/ivf_flat_build_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_build_int8_t_int64_t.cu
new file mode 100644
index 0000000000..3e5253048c
--- /dev/null
+++ b/cpp/src/neighbors/ivf_flat_build_int8_t_int64_t.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_flat-inl.cuh>
+// Instantiates the three ivf_flat::build overloads for T = int8_t, IdxT = int64_t.
+#define instantiate_raft_neighbors_ivf_flat_build(T, IdxT)      \
+  template auto raft::neighbors::ivf_flat::build<T, IdxT>(      \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::index_params& params,      \
+    const T* dataset,                                           \
+    IdxT n_rows,                                                \
+    uint32_t dim)                                               \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                \
+                                                                \
+  template auto raft::neighbors::ivf_flat::build<T, IdxT>(      \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::index_params& params,      \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset) \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                \
+                                                                \
+  template void raft::neighbors::ivf_flat::build<T, IdxT>(      \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::index_params& params,      \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset, \
+    raft::neighbors::ivf_flat::index<T, IdxT>& idx);
+instantiate_raft_neighbors_ivf_flat_build(int8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_build  // keep the macro local to this file
diff --git a/cpp/src/neighbors/ivf_flat_build_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_build_uint8_t_int64_t.cu
new file mode 100644
index 0000000000..7ea9976645
--- /dev/null
+++ b/cpp/src/neighbors/ivf_flat_build_uint8_t_int64_t.cu
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_flat-inl.cuh>
+// Instantiates the three ivf_flat::build overloads for T = uint8_t, IdxT = int64_t.
+#define instantiate_raft_neighbors_ivf_flat_build(T, IdxT)      \
+  template auto raft::neighbors::ivf_flat::build<T, IdxT>(      \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::index_params& params,      \
+    const T* dataset,                                           \
+    IdxT n_rows,                                                \
+    uint32_t dim)                                               \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                \
+                                                                \
+  template auto raft::neighbors::ivf_flat::build<T, IdxT>(      \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::index_params& params,      \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset) \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                \
+                                                                \
+  template void raft::neighbors::ivf_flat::build<T, IdxT>(      \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::index_params& params,      \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset, \
+    raft::neighbors::ivf_flat::index<T, IdxT>& idx);
+instantiate_raft_neighbors_ivf_flat_build(uint8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_build  // keep the macro local to this file
diff --git a/cpp/src/neighbors/ivf_flat_extend_float_int64_t.cu b/cpp/src/neighbors/ivf_flat_extend_float_int64_t.cu
new file mode 100644
index 0000000000..48d77488a6
--- /dev/null
+++ b/cpp/src/neighbors/ivf_flat_extend_float_int64_t.cu
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_flat-inl.cuh>
+// Instantiates the four ivf_flat::extend overloads for T = float, IdxT = int64_t.
+#define instantiate_raft_neighbors_ivf_flat_extend(T, IdxT)                \
+  template auto raft::neighbors::ivf_flat::extend<T, IdxT>(                \
+    raft::device_resources const& handle,                                  \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& orig_index,           \
+    const T* new_vectors,                                                  \
+    const IdxT* new_indices,                                               \
+    IdxT n_rows)                                                           \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                           \
+                                                                           \
+  template auto raft::neighbors::ivf_flat::extend<T, IdxT>(                \
+    raft::device_resources const& handle,                                  \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& orig_index)           \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                           \
+                                                                           \
+  template void raft::neighbors::ivf_flat::extend<T, IdxT>(                \
+    raft::device_resources const& handle,                                  \
+    raft::neighbors::ivf_flat::index<T, IdxT>* index,                      \
+    const T* new_vectors,                                                  \
+    const IdxT* new_indices,                                               \
+    IdxT n_rows);                                                          \
+                                                                           \
+  template void raft::neighbors::ivf_flat::extend<T, IdxT>(                \
+    raft::device_resources const& handle,                                  \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
+    raft::neighbors::ivf_flat::index<T, IdxT>* index);
+instantiate_raft_neighbors_ivf_flat_extend(float, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_extend  // keep the macro local to this file
diff --git a/cpp/src/neighbors/ivf_flat_extend_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_extend_int8_t_int64_t.cu
new file mode 100644
index 0000000000..68fe1e3677
--- /dev/null
+++ b/cpp/src/neighbors/ivf_flat_extend_int8_t_int64_t.cu
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_flat-inl.cuh>
+// Instantiates the four ivf_flat::extend overloads for T = int8_t, IdxT = int64_t.
+#define instantiate_raft_neighbors_ivf_flat_extend(T, IdxT)                \
+  template auto raft::neighbors::ivf_flat::extend<T, IdxT>(                \
+    raft::device_resources const& handle,                                  \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& orig_index,           \
+    const T* new_vectors,                                                  \
+    const IdxT* new_indices,                                               \
+    IdxT n_rows)                                                           \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                           \
+                                                                           \
+  template auto raft::neighbors::ivf_flat::extend<T, IdxT>(                \
+    raft::device_resources const& handle,                                  \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& orig_index)           \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                           \
+                                                                           \
+  template void raft::neighbors::ivf_flat::extend<T, IdxT>(                \
+    raft::device_resources const& handle,                                  \
+    raft::neighbors::ivf_flat::index<T, IdxT>* index,                      \
+    const T* new_vectors,                                                  \
+    const IdxT* new_indices,                                               \
+    IdxT n_rows);                                                          \
+                                                                           \
+  template void raft::neighbors::ivf_flat::extend<T, IdxT>(                \
+    raft::device_resources const& handle,                                  \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
+    raft::neighbors::ivf_flat::index<T, IdxT>* index);
+instantiate_raft_neighbors_ivf_flat_extend(int8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_extend  // keep the macro local to this file
diff --git a/cpp/src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu
new file mode 100644
index 0000000000..aa371b96bc
--- /dev/null
+++ b/cpp/src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_flat-inl.cuh>
+// Instantiates the four ivf_flat::extend overloads for T = uint8_t, IdxT = int64_t.
+#define instantiate_raft_neighbors_ivf_flat_extend(T, IdxT)                \
+  template auto raft::neighbors::ivf_flat::extend<T, IdxT>(                \
+    raft::device_resources const& handle,                                  \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& orig_index,           \
+    const T* new_vectors,                                                  \
+    const IdxT* new_indices,                                               \
+    IdxT n_rows)                                                           \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                           \
+                                                                           \
+  template auto raft::neighbors::ivf_flat::extend<T, IdxT>(                \
+    raft::device_resources const& handle,                                  \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& orig_index)           \
+    ->raft::neighbors::ivf_flat::index<T, IdxT>;                           \
+                                                                           \
+  template void raft::neighbors::ivf_flat::extend<T, IdxT>(                \
+    raft::device_resources const& handle,                                  \
+    raft::neighbors::ivf_flat::index<T, IdxT>* index,                      \
+    const T* new_vectors,                                                  \
+    const IdxT* new_indices,                                               \
+    IdxT n_rows);                                                          \
+                                                                           \
+  template void raft::neighbors::ivf_flat::extend<T, IdxT>(                \
+    raft::device_resources const& handle,                                  \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
+    raft::neighbors::ivf_flat::index<T, IdxT>* index);
+instantiate_raft_neighbors_ivf_flat_extend(uint8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_extend  // keep the macro local to this file
diff --git a/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu
new file mode 100644
index 0000000000..ab29d7f63a
--- /dev/null
+++ b/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_flat-inl.cuh>
+// Instantiates both ivf_flat::search overloads for float/int64_t; `mr` gets no default here.
+#define instantiate_raft_neighbors_ivf_flat_search(T, IdxT)     \
+  template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::search_params& params,     \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,     \
+    const T* queries,                                           \
+    uint32_t n_queries,                                         \
+    uint32_t k,                                                 \
+    IdxT* neighbors,                                            \
+    float* distances,                                           \
+    rmm::mr::device_memory_resource* mr);                       \
+                                                                \
+  template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::search_params& params,     \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,     \
+    raft::device_matrix_view<const T, IdxT, row_major> queries, \
+    raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,  \
+    raft::device_matrix_view<float, IdxT, row_major> distances);
+instantiate_raft_neighbors_ivf_flat_search(float, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_search  // keep the macro local to this file
diff --git a/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu
new file mode 100644
index 0000000000..00b8944d85
--- /dev/null
+++ b/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_flat-inl.cuh>
+// Instantiates both ivf_flat::search overloads for int8_t/int64_t; `mr` gets no default here.
+#define instantiate_raft_neighbors_ivf_flat_search(T, IdxT)     \
+  template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::search_params& params,     \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,     \
+    const T* queries,                                           \
+    uint32_t n_queries,                                         \
+    uint32_t k,                                                 \
+    IdxT* neighbors,                                            \
+    float* distances,                                           \
+    rmm::mr::device_memory_resource* mr);                       \
+                                                                \
+  template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::search_params& params,     \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,     \
+    raft::device_matrix_view<const T, IdxT, row_major> queries, \
+    raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,  \
+    raft::device_matrix_view<float, IdxT, row_major> distances);
+instantiate_raft_neighbors_ivf_flat_search(int8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_search  // keep the macro local to this file
diff --git a/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
new file mode 100644
index 0000000000..b402626aa4
--- /dev/null
+++ b/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/ivf_flat-inl.cuh>
+
+#define instantiate_raft_neighbors_ivf_flat_search(T, IdxT)     \
+  template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::search_params& params,     \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,     \
+    const T* queries,                                           \
+    uint32_t n_queries,                                         \
+    uint32_t k,                                                 \
+    IdxT* neighbors,                                            \
+    float* distances,                                           \
+    rmm::mr::device_memory_resource* mr = nullptr);             \
+                                                                \
+  template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
+    raft::device_resources const& handle,                       \
+    const raft::neighbors::ivf_flat::search_params& params,     \
+    const raft::neighbors::ivf_flat::index<T, IdxT>& index,     \
+    raft::device_matrix_view<const T, IdxT, row_major> queries, \
+    raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,  \
+    raft::device_matrix_view<float, IdxT, row_major> distances);
+instantiate_raft_neighbors_ivf_flat_search(uint8_t, int64_t);
+
+#undef instantiate_raft_neighbors_ivf_flat_search
diff --git a/cpp/src/neighbors/specializations/ivfflat_build_float_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_build_float_int64_t.cu
deleted file mode 100644
index 7082873d76..0000000000
--- a/cpp/src/neighbors/specializations/ivfflat_build_float_int64_t.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_flat {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                               \
-  template auto build(raft::device_resources const& handle,                       \
-                      const index_params& params,                                 \
-                      raft::device_matrix_view<const T, IdxT, row_major> dataset) \
-    ->index<T, IdxT>;
-
-RAFT_MAKE_INSTANCE(float, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/src/neighbors/specializations/ivfflat_build_int8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_build_int8_t_int64_t.cu
deleted file mode 100644
index ebc1a7fefa..0000000000
--- a/cpp/src/neighbors/specializations/ivfflat_build_int8_t_int64_t.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_flat {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                               \
-  template auto build(raft::device_resources const& handle,                       \
-                      const index_params& params,                                 \
-                      raft::device_matrix_view<const T, IdxT, row_major> dataset) \
-    ->index<T, IdxT>;
-
-RAFT_MAKE_INSTANCE(int8_t, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/src/neighbors/specializations/ivfflat_build_uint8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_build_uint8_t_int64_t.cu
deleted file mode 100644
index 870db6e97e..0000000000
--- a/cpp/src/neighbors/specializations/ivfflat_build_uint8_t_int64_t.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_flat {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                               \
-  template auto build(raft::device_resources const& handle,                       \
-                      const index_params& params,                                 \
-                      raft::device_matrix_view<const T, IdxT, row_major> dataset) \
-    ->index<T, IdxT>;
-
-RAFT_MAKE_INSTANCE(uint8_t, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/src/neighbors/specializations/ivfflat_extend_float_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_extend_float_int64_t.cu
deleted file mode 100644
index 71af06ad71..0000000000
--- a/cpp/src/neighbors/specializations/ivfflat_extend_float_int64_t.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_flat {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                                           \
-  template auto extend(raft::device_resources const& handle,                                  \
-                       raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
-                       std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
-                       const raft::neighbors::ivf_flat::index<T, IdxT>& orig_index)           \
-    ->raft::neighbors::ivf_flat::index<T, IdxT>;                                              \
-                                                                                              \
-  template void extend(raft::device_resources const& handle,                                  \
-                       raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
-                       std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
-                       raft::neighbors::ivf_flat::index<T, IdxT>* idx);
-
-RAFT_MAKE_INSTANCE(float, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/src/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu
deleted file mode 100644
index bb7bb6e7eb..0000000000
--- a/cpp/src/neighbors/specializations/ivfflat_extend_int8_t_int64_t.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_flat {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                                           \
-  template auto extend(raft::device_resources const& handle,                                  \
-                       raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
-                       std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
-                       const raft::neighbors::ivf_flat::index<T, IdxT>& orig_index)           \
-    ->raft::neighbors::ivf_flat::index<T, IdxT>;                                              \
-                                                                                              \
-  template void extend(raft::device_resources const& handle,                                  \
-                       raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
-                       std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
-                       raft::neighbors::ivf_flat::index<T, IdxT>* idx);
-
-RAFT_MAKE_INSTANCE(int8_t, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/src/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu
deleted file mode 100644
index 607b4b0913..0000000000
--- a/cpp/src/neighbors/specializations/ivfflat_extend_uint8_t_int64_t.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_flat {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                                           \
-  template auto extend(raft::device_resources const& handle,                                  \
-                       raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
-                       std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
-                       const raft::neighbors::ivf_flat::index<T, IdxT>& orig_index)           \
-    ->raft::neighbors::ivf_flat::index<T, IdxT>;                                              \
-                                                                                              \
-  template void extend(raft::device_resources const& handle,                                  \
-                       raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
-                       std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
-                       raft::neighbors::ivf_flat::index<T, IdxT>* idx);
-
-RAFT_MAKE_INSTANCE(uint8_t, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/src/neighbors/specializations/ivfflat_search_float_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_search_float_int64_t.cu
deleted file mode 100644
index dce7083139..0000000000
--- a/cpp/src/neighbors/specializations/ivfflat_search_float_int64_t.cu
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_flat {
-
-// greppable-id-specializations-ivf-flat-search: The ivfflat_interleaved_scan
-// function is used in both raft::neighbors::ivf_flat::search and
-// raft::neighbors::detail::refine_device. To prevent a duplicate instantiation
-// of this function (which defines ~270 kernels) in the refine specializations,
-// an extern template definition is provided. To make sure
-// ivfflat_interleaved_scan is actually compiled here, we explicitly instantiate
-// it below. Please check related function calls after editing template
-// definition below. Search for `greppable-id-specializations-ivf-flat-search`
-// to find them.
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                          \
-  template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan< \
-    T,                                                                       \
-    typename raft::spatial::knn::detail::utils::config<T>::value_t,          \
-    IdxT>(const index<T, IdxT>& index,                                       \
-          const T* queries,                                                  \
-          const uint32_t* coarse_query_results,                              \
-          const uint32_t n_queries,                                          \
-          const raft::distance::DistanceType metric,                         \
-          const uint32_t n_probes,                                           \
-          const uint32_t k,                                                  \
-          const bool select_min,                                             \
-          IdxT* neighbors,                                                   \
-          float* distances,                                                  \
-          uint32_t& grid_dim_x,                                              \
-          rmm::cuda_stream_view stream);                                     \
-                                                                             \
-  template void search(raft::device_resources const&,                        \
-                       raft::neighbors::ivf_flat::search_params const&,      \
-                       const raft::neighbors::ivf_flat::index<T, IdxT>&,     \
-                       raft::device_matrix_view<const T, IdxT, row_major>,   \
-                       raft::device_matrix_view<IdxT, IdxT, row_major>,      \
-                       raft::device_matrix_view<float, IdxT, row_major>);
-
-RAFT_MAKE_INSTANCE(float, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/src/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu
deleted file mode 100644
index b03d878bae..0000000000
--- a/cpp/src/neighbors/specializations/ivfflat_search_int8_t_int64_t.cu
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_flat {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                          \
-  template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan< \
-    T,                                                                       \
-    typename raft::spatial::knn::detail::utils::config<T>::value_t,          \
-    IdxT>(const index<T, IdxT>& index,                                       \
-          const T* queries,                                                  \
-          const uint32_t* coarse_query_results,                              \
-          const uint32_t n_queries,                                          \
-          const raft::distance::DistanceType metric,                         \
-          const uint32_t n_probes,                                           \
-          const uint32_t k,                                                  \
-          const bool select_min,                                             \
-          IdxT* neighbors,                                                   \
-          float* distances,                                                  \
-          uint32_t& grid_dim_x,                                              \
-          rmm::cuda_stream_view stream);                                     \
-                                                                             \
-  template void search(raft::device_resources const&,                        \
-                       raft::neighbors::ivf_flat::search_params const&,      \
-                       const raft::neighbors::ivf_flat::index<T, IdxT>&,     \
-                       raft::device_matrix_view<const T, IdxT, row_major>,   \
-                       raft::device_matrix_view<IdxT, IdxT, row_major>,      \
-                       raft::device_matrix_view<float, IdxT, row_major>);
-
-RAFT_MAKE_INSTANCE(int8_t, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/src/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu
deleted file mode 100644
index 2d42bae0d1..0000000000
--- a/cpp/src/neighbors/specializations/ivfflat_search_uint8_t_int64_t.cu
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_flat {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                          \
-  template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan< \
-    T,                                                                       \
-    typename raft::spatial::knn::detail::utils::config<T>::value_t,          \
-    IdxT>(const index<T, IdxT>& index,                                       \
-          const T* queries,                                                  \
-          const uint32_t* coarse_query_results,                              \
-          const uint32_t n_queries,                                          \
-          const raft::distance::DistanceType metric,                         \
-          const uint32_t n_probes,                                           \
-          const uint32_t k,                                                  \
-          const bool select_min,                                             \
-          IdxT* neighbors,                                                   \
-          float* distances,                                                  \
-          uint32_t& grid_dim_x,                                              \
-          rmm::cuda_stream_view stream);                                     \
-                                                                             \
-  template void search(raft::device_resources const&,                        \
-                       raft::neighbors::ivf_flat::search_params const&,      \
-                       const raft::neighbors::ivf_flat::index<T, IdxT>&,     \
-                       raft::device_matrix_view<const T, IdxT, row_major>,   \
-                       raft::device_matrix_view<IdxT, IdxT, row_major>,      \
-                       raft::device_matrix_view<float, IdxT, row_major>);
-
-RAFT_MAKE_INSTANCE(uint8_t, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/src/neighbors/specializations/ivfpq_build_float_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_build_float_int64_t.cu
deleted file mode 100644
index d559291b93..0000000000
--- a/cpp/src/neighbors/specializations/ivfpq_build_float_int64_t.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_pq {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                                        \
-  template auto build<T, IdxT>(raft::device_resources const& handle,                       \
-                               const index_params& params,                                 \
-                               raft::device_matrix_view<const T, IdxT, row_major> dataset) \
-    ->index<IdxT>;
-
-RAFT_MAKE_INSTANCE(float, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/src/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu
deleted file mode 100644
index c8b31e1fff..0000000000
--- a/cpp/src/neighbors/specializations/ivfpq_build_int8_t_int64_t.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_pq {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                                        \
-  template auto build<T, IdxT>(raft::device_resources const& handle,                       \
-                               const index_params& params,                                 \
-                               raft::device_matrix_view<const T, IdxT, row_major> dataset) \
-    ->index<IdxT>;
-
-RAFT_MAKE_INSTANCE(int8_t, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/src/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu
deleted file mode 100644
index 5fc62969f0..0000000000
--- a/cpp/src/neighbors/specializations/ivfpq_build_uint8_t_int64_t.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_pq {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                                        \
-  template auto build<T, IdxT>(raft::device_resources const& handle,                       \
-                               const index_params& params,                                 \
-                               raft::device_matrix_view<const T, IdxT, row_major> dataset) \
-    ->index<IdxT>;
-
-RAFT_MAKE_INSTANCE(uint8_t, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/src/neighbors/specializations/ivfpq_extend_float_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_extend_float_int64_t.cu
deleted file mode 100644
index 4cc616f32d..0000000000
--- a/cpp/src/neighbors/specializations/ivfpq_extend_float_int64_t.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_pq {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                                   \
-  template auto extend<T, IdxT>(                                                      \
-    raft::device_resources const& handle,                                             \
-    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                   \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices, \
-    const index<IdxT>& idx)                                                           \
-    ->index<IdxT>;                                                                    \
-  template void extend<T, IdxT>(                                                      \
-    raft::device_resources const& handle,                                             \
-    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                   \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices, \
-    index<IdxT>* idx);
-
-RAFT_MAKE_INSTANCE(float, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/src/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu
deleted file mode 100644
index a3117aae0f..0000000000
--- a/cpp/src/neighbors/specializations/ivfpq_extend_int8_t_int64_t.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_pq {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                                   \
-  template auto extend<T, IdxT>(                                                      \
-    raft::device_resources const& handle,                                             \
-    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                   \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices, \
-    const index<IdxT>& idx)                                                           \
-    ->index<IdxT>;                                                                    \
-  template void extend<T, IdxT>(                                                      \
-    raft::device_resources const& handle,                                             \
-    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                   \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices, \
-    index<IdxT>* idx);
-
-RAFT_MAKE_INSTANCE(int8_t, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/src/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu
deleted file mode 100644
index a5e3d68569..0000000000
--- a/cpp/src/neighbors/specializations/ivfpq_extend_uint8_t_int64_t.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_pq {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                                   \
-  template auto extend<T, IdxT>(                                                      \
-    raft::device_resources const& handle,                                             \
-    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                   \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices, \
-    const index<IdxT>& idx)                                                           \
-    ->index<IdxT>;                                                                    \
-  template void extend<T, IdxT>(                                                      \
-    raft::device_resources const& handle,                                             \
-    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                   \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices, \
-    index<IdxT>* idx);
-
-RAFT_MAKE_INSTANCE(uint8_t, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/src/neighbors/specializations/ivfpq_search_float_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_search_float_int64_t.cu
deleted file mode 100644
index 92a4d89e6b..0000000000
--- a/cpp/src/neighbors/specializations/ivfpq_search_float_int64_t.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_pq {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                                         \
-  template void search<T, IdxT>(raft::device_resources const& handle,                       \
-                                const search_params& params,                                \
-                                const index<IdxT>& idx,                                     \
-                                raft::device_matrix_view<const T, IdxT, row_major> queries, \
-                                raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,  \
-                                raft::device_matrix_view<float, IdxT, row_major> distances);
-
-RAFT_MAKE_INSTANCE(float, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/src/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu
deleted file mode 100644
index 62a8b48ad5..0000000000
--- a/cpp/src/neighbors/specializations/ivfpq_search_int8_t_int64_t.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_pq {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                                         \
-  template void search<T, IdxT>(raft::device_resources const& handle,                       \
-                                const search_params& params,                                \
-                                const index<IdxT>& idx,                                     \
-                                raft::device_matrix_view<const T, IdxT, row_major> queries, \
-                                raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,  \
-                                raft::device_matrix_view<float, IdxT, row_major> distances);
-
-RAFT_MAKE_INSTANCE(int8_t, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/src/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu b/cpp/src/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu
deleted file mode 100644
index 3bcf134a22..0000000000
--- a/cpp/src/neighbors/specializations/ivfpq_search_uint8_t_int64_t.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_pq {
-
-#define RAFT_MAKE_INSTANCE(T, IdxT)                                                         \
-  template void search<T, IdxT>(raft::device_resources const& handle,                       \
-                                const search_params& params,                                \
-                                const index<IdxT>& idx,                                     \
-                                raft::device_matrix_view<const T, IdxT, row_major> queries, \
-                                raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,  \
-                                raft::device_matrix_view<float, IdxT, row_major> distances);
-
-RAFT_MAKE_INSTANCE(uint8_t, int64_t);
-
-#undef RAFT_MAKE_INSTANCE
-
-}  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/src/raft_runtime/neighbors/ivf_flat_build.cu b/cpp/src/raft_runtime/neighbors/ivf_flat_build.cu
index 0d82fdbb08..48a40ab56e 100644
--- a/cpp/src/raft_runtime/neighbors/ivf_flat_build.cu
+++ b/cpp/src/raft_runtime/neighbors/ivf_flat_build.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/neighbors/specializations.cuh>
+#include <raft/neighbors/ivf_flat.cuh>
 #include <raft_runtime/neighbors/ivf_flat.hpp>
 
 namespace raft::runtime::neighbors::ivf_flat {
diff --git a/cpp/src/raft_runtime/neighbors/ivf_flat_search.cu b/cpp/src/raft_runtime/neighbors/ivf_flat_search.cu
index b843ee7c30..eefc7f2932 100644
--- a/cpp/src/raft_runtime/neighbors/ivf_flat_search.cu
+++ b/cpp/src/raft_runtime/neighbors/ivf_flat_search.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <raft/neighbors/specializations.cuh>
+#include <raft/neighbors/ivf_flat.cuh>
 #include <raft_runtime/neighbors/ivf_flat.hpp>
 
 namespace raft::runtime::neighbors::ivf_flat {

From d528f4b3ee64b8b46da009d0a94b42feb9d18d35 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 13 Apr 2023 17:33:35 +0200
Subject: [PATCH 31/89] Split refine

---
 cpp/CMakeLists.txt                            |   9 +-
 cpp/include/raft/neighbors/refine-ext.cuh     | 129 ++++++++++++++++++
 cpp/include/raft/neighbors/refine.cuh         |  25 ++++
 .../raft/neighbors/specializations.cuh        |   2 -
 .../raft/neighbors/specializations/refine.cuh |  51 -------
 cpp/src/neighbors/refine_00_generate.py       |  57 ++++++++
 cpp/src/neighbors/refine_float_float.cu       |  41 ++++++
 cpp/src/neighbors/refine_int8_t_float.cu      |  41 ++++++
 cpp/src/neighbors/refine_uint8_t_float.cu     |  41 ++++++
 .../specializations/refine_d_int64_t_float.cu |  31 -----
 .../refine_d_int64_t_int8_t.cu                |  31 -----
 .../refine_d_int64_t_uint8_t.cu               |  31 -----
 .../specializations/refine_h_int64_t_float.cu |  31 -----
 .../refine_h_int64_t_int8_t.cu                |  30 ----
 .../refine_h_int64_t_uint8_t.cu               |  31 -----
 .../neighbors/refine_d_int64_t_float.cu       |   1 -
 16 files changed, 337 insertions(+), 245 deletions(-)
 create mode 100644 cpp/include/raft/neighbors/refine-ext.cuh
 delete mode 100644 cpp/include/raft/neighbors/specializations/refine.cuh
 create mode 100644 cpp/src/neighbors/refine_00_generate.py
 create mode 100644 cpp/src/neighbors/refine_float_float.cu
 create mode 100644 cpp/src/neighbors/refine_int8_t_float.cu
 create mode 100644 cpp/src/neighbors/refine_uint8_t_float.cu
 delete mode 100644 cpp/src/neighbors/specializations/refine_d_int64_t_float.cu
 delete mode 100644 cpp/src/neighbors/specializations/refine_d_int64_t_int8_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/refine_d_int64_t_uint8_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/refine_h_int64_t_float.cu
 delete mode 100644 cpp/src/neighbors/specializations/refine_h_int64_t_int8_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/refine_h_int64_t_uint8_t.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index cf0937beed..eab6cffe89 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -285,12 +285,6 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
     src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
     src/neighbors/detail/selection_faiss.cu
-    src/neighbors/specializations/refine_d_int64_t_float.cu
-    src/neighbors/specializations/refine_d_int64_t_int8_t.cu
-    src/neighbors/specializations/refine_d_int64_t_uint8_t.cu
-    src/neighbors/specializations/refine_h_int64_t_float.cu
-    src/neighbors/specializations/refine_h_int64_t_int8_t.cu
-    src/neighbors/specializations/refine_h_int64_t_uint8_t.cu
     src/distance/distance.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
@@ -366,6 +360,9 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/ivfpq_search_float_int64_t.cu
     src/neighbors/ivfpq_search_int8_t_int64_t.cu
     src/neighbors/ivfpq_search_uint8_t_int64_t.cu
+    src/neighbors/refine_float_float.cu
+    src/neighbors/refine_int8_t_float.cu
+    src/neighbors/refine_uint8_t_float.cu
     src/raft_runtime/cluster/cluster_cost.cuh
     src/raft_runtime/cluster/cluster_cost_double.cu
     src/raft_runtime/cluster/cluster_cost_float.cu
diff --git a/cpp/include/raft/neighbors/refine-ext.cuh b/cpp/include/raft/neighbors/refine-ext.cuh
new file mode 100644
index 0000000000..c210e599ae
--- /dev/null
+++ b/cpp/include/raft/neighbors/refine-ext.cuh
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdspan.hpp>
+#include <raft/matrix/matrix.cuh>
+#include <raft/neighbors/detail/refine.cuh>
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+#include <raft/util/raft_explicit.hpp>
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE
+
+namespace raft::neighbors {
+
+/**
+ * @defgroup ann_refine Approximate Nearest Neighbors Refinement
+ * @{
+ */
+
+/**
+ * @brief Refine nearest neighbor search.
+ *
+ * Refinement is an operation that follows an approximate NN search. The approximate search has
+ * already selected n_candidates neighbor candidates for each query. We narrow it down to k
+ * neighbors. For each query, we calculate the exact distance between the query and its
+ * n_candidates neighbor candidates, and select the k nearest ones.
+ *
+ * The k nearest neighbors and distances are returned.
+ *
+ * Example usage
+ * @code{.cpp}
+ *   using namespace raft::neighbors;
+ *   // use default index parameters
+ *   ivf_pq::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_pq::build(handle, index_params, dataset, N, D);
+ *   // use default search parameters
+ *   ivf_pq::search_params search_params;
+ *   // search m = 4 * k nearest neighbours for each of the N queries
+ *   ivf_pq::search(handle, search_params, index, queries, N, 4 * k, neighbor_candidates,
+ *                  out_dists_tmp);
+ *   // refine it to the k nearest one
+ *   refine(handle, dataset, queries, neighbor_candidates, out_indices, out_dists,
+ *           index.metric());
+ * @endcode
+ *
+ *
+ * @param[in] handle the raft handle
+ * @param[in] dataset device matrix that stores the dataset [n_rows, dims]
+ * @param[in] queries device matrix of the queries [n_queries, dims]
+ * @param[in] neighbor_candidates indices of candidate vectors [n_queries, n_candidates], where
+ *   n_candidates >= k
+ * @param[out] indices device matrix that stores the refined indices [n_queries, k]
+ * @param[out] distances device matrix that stores the refined distances [n_queries, k]
+ * @param[in] metric distance metric to use. Euclidean (L2) is used by default
+ */
+template <typename idx_t, typename data_t, typename distance_t, typename matrix_idx>
+void refine(raft::device_resources const& handle,
+            raft::device_matrix_view<const data_t, matrix_idx, row_major> dataset,
+            raft::device_matrix_view<const data_t, matrix_idx, row_major> queries,
+            raft::device_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,
+            raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,
+            raft::device_matrix_view<distance_t, matrix_idx, row_major> distances,
+            distance::DistanceType metric = distance::DistanceType::L2Unexpanded) RAFT_EXPLICIT;
+
+/** Same as above, but all input and output data is in host memory.
+ * @param[in] handle the raft handle
+ * @param[in] dataset host matrix that stores the dataset [n_rows, dims]
+ * @param[in] queries host matrix of the queries [n_queries, dims]
+ * @param[in] neighbor_candidates host matrix with indices of candidate vectors [n_queries,
+ *   n_candidates], where n_candidates >= k
+ * @param[out] indices host matrix that stores the refined indices [n_queries, k]
+ * @param[out] distances host matrix that stores the refined distances [n_queries, k]
+ * @param[in] metric distance metric to use. Euclidean (L2) is used by default
+ */
+template <typename idx_t, typename data_t, typename distance_t, typename matrix_idx>
+void refine(raft::device_resources const& handle,
+            raft::host_matrix_view<const data_t, matrix_idx, row_major> dataset,
+            raft::host_matrix_view<const data_t, matrix_idx, row_major> queries,
+            raft::host_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,
+            raft::host_matrix_view<idx_t, matrix_idx, row_major> indices,
+            raft::host_matrix_view<distance_t, matrix_idx, row_major> distances,
+            distance::DistanceType metric = distance::DistanceType::L2Unexpanded) RAFT_EXPLICIT;
+
+/** @} */  // end group ann_refine
+}  // namespace raft::neighbors
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+
+#define instantiate_raft_neighbors_refine(idx_t, data_t, distance_t, matrix_idx)       \
+  extern template void raft::neighbors::refine<idx_t, data_t, distance_t, matrix_idx>( \
+    raft::device_resources const& handle,                                              \
+    raft::device_matrix_view<const data_t, matrix_idx, row_major> dataset,             \
+    raft::device_matrix_view<const data_t, matrix_idx, row_major> queries,             \
+    raft::device_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,  \
+    raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,                    \
+    raft::device_matrix_view<distance_t, matrix_idx, row_major> distances,             \
+    raft::distance::DistanceType metric);                                              \
+                                                                                       \
+  extern template void raft::neighbors::refine<idx_t, data_t, distance_t, matrix_idx>( \
+    raft::device_resources const& handle,                                              \
+    raft::host_matrix_view<const data_t, matrix_idx, row_major> dataset,               \
+    raft::host_matrix_view<const data_t, matrix_idx, row_major> queries,               \
+    raft::host_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,    \
+    raft::host_matrix_view<idx_t, matrix_idx, row_major> indices,                      \
+    raft::host_matrix_view<distance_t, matrix_idx, row_major> distances,               \
+    raft::distance::DistanceType metric);
+
+instantiate_raft_neighbors_refine(int64_t, float, float, int64_t);
+instantiate_raft_neighbors_refine(int64_t, int8_t, float, int64_t);
+instantiate_raft_neighbors_refine(int64_t, uint8_t, float, int64_t);
+
+#undef instantiate_raft_neighbors_refine
diff --git a/cpp/include/raft/neighbors/refine.cuh b/cpp/include/raft/neighbors/refine.cuh
index e69de29bb2..06cbc8241c 100644
--- a/cpp/include/raft/neighbors/refine.cuh
+++ b/cpp/include/raft/neighbors/refine.cuh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "refine-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "refine-ext.cuh"
+#endif
diff --git a/cpp/include/raft/neighbors/specializations.cuh b/cpp/include/raft/neighbors/specializations.cuh
index 53fd47b23c..7d3aa0fd97 100644
--- a/cpp/include/raft/neighbors/specializations.cuh
+++ b/cpp/include/raft/neighbors/specializations.cuh
@@ -20,8 +20,6 @@
 #include <raft/neighbors/specializations/brute_force.cuh>
 #include <raft/neighbors/specializations/fused_l2_knn.cuh>
 
-#include <raft/neighbors/specializations/refine.cuh>
-
 #include <raft/cluster/specializations.cuh>
 #include <raft/distance/specializations.cuh>
 #include <raft/matrix/specializations.cuh>
diff --git a/cpp/include/raft/neighbors/specializations/refine.cuh b/cpp/include/raft/neighbors/specializations/refine.cuh
deleted file mode 100644
index aef4834c9f..0000000000
--- a/cpp/include/raft/neighbors/specializations/refine.cuh
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/neighbors/refine.cuh>
-
-namespace raft::neighbors {
-
-#ifdef RAFT_INST
-#undef RAFT_INST
-#endif
-
-#define RAFT_INST(T, IdxT)                                                        \
-  extern template void refine<IdxT, T, float, int64_t>(                           \
-    raft::device_resources const& handle,                                         \
-    raft::device_matrix_view<const T, int64_t, row_major> dataset,                \
-    raft::device_matrix_view<const T, int64_t, row_major> queries,                \
-    raft::device_matrix_view<const IdxT, int64_t, row_major> neighbor_candidates, \
-    raft::device_matrix_view<IdxT, int64_t, row_major> indices,                   \
-    raft::device_matrix_view<float, int64_t, row_major> distances,                \
-    distance::DistanceType metric);                                               \
-                                                                                  \
-  extern template void refine<IdxT, T, float, int64_t>(                           \
-    raft::device_resources const& handle,                                         \
-    raft::host_matrix_view<const T, int64_t, row_major> dataset,                  \
-    raft::host_matrix_view<const T, int64_t, row_major> queries,                  \
-    raft::host_matrix_view<const IdxT, int64_t, row_major> neighbor_candidates,   \
-    raft::host_matrix_view<IdxT, int64_t, row_major> indices,                     \
-    raft::host_matrix_view<float, int64_t, row_major> distances,                  \
-    distance::DistanceType metric);
-
-RAFT_INST(float, int64_t);
-RAFT_INST(uint8_t, int64_t);
-RAFT_INST(int8_t, int64_t);
-
-#undef RAFT_INST
-}  // namespace raft::neighbors
diff --git a/cpp/src/neighbors/refine_00_generate.py b/cpp/src/neighbors/refine_00_generate.py
new file mode 100644
index 0000000000..c3795a031e
--- /dev/null
+++ b/cpp/src/neighbors/refine_00_generate.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+header = """
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine-inl.cuh>
+
+#define instantiate_raft_neighbors_refine(idx_t, data_t, distance_t, matrix_idx)       \\
+  template void raft::neighbors::refine<idx_t, data_t, distance_t, matrix_idx>(        \\
+    raft::device_resources const& handle,                                              \\
+    raft::device_matrix_view<const data_t, matrix_idx, row_major> dataset,             \\
+    raft::device_matrix_view<const data_t, matrix_idx, row_major> queries,             \\
+    raft::device_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,  \\
+    raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,                    \\
+    raft::device_matrix_view<distance_t, matrix_idx, row_major> distances,             \\
+    raft::distance::DistanceType metric);                                              \\
+                                                                                       \\
+  template void raft::neighbors::refine<idx_t, data_t, distance_t, matrix_idx>(        \\
+    raft::device_resources const& handle,                                              \\
+    raft::host_matrix_view<const data_t, matrix_idx, row_major> dataset,               \\
+    raft::host_matrix_view<const data_t, matrix_idx, row_major> queries,               \\
+    raft::host_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,    \\
+    raft::host_matrix_view<idx_t, matrix_idx, row_major> indices,                      \\
+    raft::host_matrix_view<distance_t, matrix_idx, row_major> distances,               \\
+    raft::distance::DistanceType metric);
+
+"""
+
+types = dict(
+    float_float= ("float", "float"),
+    int8_t_float=("int8_t", "float"),
+    uint8_t_float=("uint8_t", "float"),
+)
+
+for type_path, (data_t, distance_t) in types.items():
+    path = f"refine_{type_path}.cu"
+    with open(path, "w") as f:
+        f.write(header)
+        f.write(f"instantiate_raft_neighbors_refine(int64_t, {data_t}, {distance_t}, int64_t);\n\n")
+        f.write(f"#undef instantiate_raft_neighbors_refine\n")
+
+    # for pasting into CMakeLists.txt
+    print(f"src/neighbors/{path}")
diff --git a/cpp/src/neighbors/refine_float_float.cu b/cpp/src/neighbors/refine_float_float.cu
new file mode 100644
index 0000000000..08976449f5
--- /dev/null
+++ b/cpp/src/neighbors/refine_float_float.cu
@@ -0,0 +1,41 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine-inl.cuh>
+
+#define instantiate_raft_neighbors_refine(idx_t, data_t, distance_t, matrix_idx)      \
+  template void raft::neighbors::refine<idx_t, data_t, distance_t, matrix_idx>(       \
+    raft::device_resources const& handle,                                             \
+    raft::device_matrix_view<const data_t, matrix_idx, row_major> dataset,            \
+    raft::device_matrix_view<const data_t, matrix_idx, row_major> queries,            \
+    raft::device_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates, \
+    raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,                   \
+    raft::device_matrix_view<distance_t, matrix_idx, row_major> distances,            \
+    raft::distance::DistanceType metric);                                             \
+                                                                                      \
+  template void raft::neighbors::refine<idx_t, data_t, distance_t, matrix_idx>(       \
+    raft::device_resources const& handle,                                             \
+    raft::host_matrix_view<const data_t, matrix_idx, row_major> dataset,              \
+    raft::host_matrix_view<const data_t, matrix_idx, row_major> queries,              \
+    raft::host_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,   \
+    raft::host_matrix_view<idx_t, matrix_idx, row_major> indices,                     \
+    raft::host_matrix_view<distance_t, matrix_idx, row_major> distances,              \
+    raft::distance::DistanceType metric);
+
+instantiate_raft_neighbors_refine(int64_t, float, float, int64_t);
+
+#undef instantiate_raft_neighbors_refine
diff --git a/cpp/src/neighbors/refine_int8_t_float.cu b/cpp/src/neighbors/refine_int8_t_float.cu
new file mode 100644
index 0000000000..66293b741a
--- /dev/null
+++ b/cpp/src/neighbors/refine_int8_t_float.cu
@@ -0,0 +1,41 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine-inl.cuh>
+
+#define instantiate_raft_neighbors_refine(idx_t, data_t, distance_t, matrix_idx)      \
+  template void raft::neighbors::refine<idx_t, data_t, distance_t, matrix_idx>(       \
+    raft::device_resources const& handle,                                             \
+    raft::device_matrix_view<const data_t, matrix_idx, row_major> dataset,            \
+    raft::device_matrix_view<const data_t, matrix_idx, row_major> queries,            \
+    raft::device_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates, \
+    raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,                   \
+    raft::device_matrix_view<distance_t, matrix_idx, row_major> distances,            \
+    raft::distance::DistanceType metric);                                             \
+                                                                                      \
+  template void raft::neighbors::refine<idx_t, data_t, distance_t, matrix_idx>(       \
+    raft::device_resources const& handle,                                             \
+    raft::host_matrix_view<const data_t, matrix_idx, row_major> dataset,              \
+    raft::host_matrix_view<const data_t, matrix_idx, row_major> queries,              \
+    raft::host_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,   \
+    raft::host_matrix_view<idx_t, matrix_idx, row_major> indices,                     \
+    raft::host_matrix_view<distance_t, matrix_idx, row_major> distances,              \
+    raft::distance::DistanceType metric);
+
+instantiate_raft_neighbors_refine(int64_t, int8_t, float, int64_t);
+
+#undef instantiate_raft_neighbors_refine
diff --git a/cpp/src/neighbors/refine_uint8_t_float.cu b/cpp/src/neighbors/refine_uint8_t_float.cu
new file mode 100644
index 0000000000..c5e4f5e19c
--- /dev/null
+++ b/cpp/src/neighbors/refine_uint8_t_float.cu
@@ -0,0 +1,41 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/neighbors/refine-inl.cuh>
+
+#define instantiate_raft_neighbors_refine(idx_t, data_t, distance_t, matrix_idx)      \
+  template void raft::neighbors::refine<idx_t, data_t, distance_t, matrix_idx>(       \
+    raft::device_resources const& handle,                                             \
+    raft::device_matrix_view<const data_t, matrix_idx, row_major> dataset,            \
+    raft::device_matrix_view<const data_t, matrix_idx, row_major> queries,            \
+    raft::device_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates, \
+    raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,                   \
+    raft::device_matrix_view<distance_t, matrix_idx, row_major> distances,            \
+    raft::distance::DistanceType metric);                                             \
+  /* Above: device_matrix_view overload. Below: host_matrix_view overload. */         \
+  template void raft::neighbors::refine<idx_t, data_t, distance_t, matrix_idx>(       \
+    raft::device_resources const& handle,                                             \
+    raft::host_matrix_view<const data_t, matrix_idx, row_major> dataset,              \
+    raft::host_matrix_view<const data_t, matrix_idx, row_major> queries,              \
+    raft::host_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,   \
+    raft::host_matrix_view<idx_t, matrix_idx, row_major> indices,                     \
+    raft::host_matrix_view<distance_t, matrix_idx, row_major> distances,              \
+    raft::distance::DistanceType metric)
+
+instantiate_raft_neighbors_refine(int64_t, uint8_t, float, int64_t);
+
+#undef instantiate_raft_neighbors_refine
diff --git a/cpp/src/neighbors/specializations/refine_d_int64_t_float.cu b/cpp/src/neighbors/specializations/refine_d_int64_t_float.cu
deleted file mode 100644
index 0b0125459d..0000000000
--- a/cpp/src/neighbors/specializations/refine_d_int64_t_float.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/refine.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors {
-
-template void refine<int64_t, float, float, int64_t>(
-  raft::device_resources const& handle,
-  raft::device_matrix_view<const float, int64_t, row_major> dataset,
-  raft::device_matrix_view<const float, int64_t, row_major> queries,
-  raft::device_matrix_view<const int64_t, int64_t, row_major> neighbor_candidates,
-  raft::device_matrix_view<int64_t, int64_t, row_major> indices,
-  raft::device_matrix_view<float, int64_t, row_major> distances,
-  distance::DistanceType metric);
-
-}  // namespace raft::neighbors
diff --git a/cpp/src/neighbors/specializations/refine_d_int64_t_int8_t.cu b/cpp/src/neighbors/specializations/refine_d_int64_t_int8_t.cu
deleted file mode 100644
index d6c817b971..0000000000
--- a/cpp/src/neighbors/specializations/refine_d_int64_t_int8_t.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/refine.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors {
-
-template void refine<int64_t, int8_t, float, int64_t>(
-  raft::device_resources const& handle,
-  raft::device_matrix_view<const int8_t, int64_t, row_major> dataset,
-  raft::device_matrix_view<const int8_t, int64_t, row_major> queries,
-  raft::device_matrix_view<const int64_t, int64_t, row_major> neighbor_candidates,
-  raft::device_matrix_view<int64_t, int64_t, row_major> indices,
-  raft::device_matrix_view<float, int64_t, row_major> distances,
-  distance::DistanceType metric);
-
-}  // namespace raft::neighbors
diff --git a/cpp/src/neighbors/specializations/refine_d_int64_t_uint8_t.cu b/cpp/src/neighbors/specializations/refine_d_int64_t_uint8_t.cu
deleted file mode 100644
index 3e0ca627a6..0000000000
--- a/cpp/src/neighbors/specializations/refine_d_int64_t_uint8_t.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/refine.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors {
-
-template void refine<int64_t, uint8_t, float, int64_t>(
-  raft::device_resources const& handle,
-  raft::device_matrix_view<const uint8_t, int64_t, row_major> dataset,
-  raft::device_matrix_view<const uint8_t, int64_t, row_major> queries,
-  raft::device_matrix_view<const int64_t, int64_t, row_major> neighbor_candidates,
-  raft::device_matrix_view<int64_t, int64_t, row_major> indices,
-  raft::device_matrix_view<float, int64_t, row_major> distances,
-  distance::DistanceType metric);
-
-}  // namespace raft::neighbors
diff --git a/cpp/src/neighbors/specializations/refine_h_int64_t_float.cu b/cpp/src/neighbors/specializations/refine_h_int64_t_float.cu
deleted file mode 100644
index 66a6bace53..0000000000
--- a/cpp/src/neighbors/specializations/refine_h_int64_t_float.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/refine.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors {
-
-template void refine<int64_t, float, float, int64_t>(
-  raft::device_resources const& handle,
-  raft::host_matrix_view<const float, int64_t, row_major> dataset,
-  raft::host_matrix_view<const float, int64_t, row_major> queries,
-  raft::host_matrix_view<const int64_t, int64_t, row_major> neighbor_candidates,
-  raft::host_matrix_view<int64_t, int64_t, row_major> indices,
-  raft::host_matrix_view<float, int64_t, row_major> distances,
-  distance::DistanceType metric);
-
-}  // namespace raft::neighbors
diff --git a/cpp/src/neighbors/specializations/refine_h_int64_t_int8_t.cu b/cpp/src/neighbors/specializations/refine_h_int64_t_int8_t.cu
deleted file mode 100644
index 22824b3a8e..0000000000
--- a/cpp/src/neighbors/specializations/refine_h_int64_t_int8_t.cu
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/refine.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors {
-template void refine<int64_t, int8_t, float, int64_t>(
-  raft::device_resources const& handle,
-  raft::host_matrix_view<const int8_t, int64_t, row_major> dataset,
-  raft::host_matrix_view<const int8_t, int64_t, row_major> queries,
-  raft::host_matrix_view<const int64_t, int64_t, row_major> neighbor_candidates,
-  raft::host_matrix_view<int64_t, int64_t, row_major> indices,
-  raft::host_matrix_view<float, int64_t, row_major> distances,
-  distance::DistanceType metric);
-
-}  // namespace raft::neighbors
diff --git a/cpp/src/neighbors/specializations/refine_h_int64_t_uint8_t.cu b/cpp/src/neighbors/specializations/refine_h_int64_t_uint8_t.cu
deleted file mode 100644
index 58dcfc87c9..0000000000
--- a/cpp/src/neighbors/specializations/refine_h_int64_t_uint8_t.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/refine.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors {
-
-template void refine<int64_t, uint8_t, float, int64_t>(
-  raft::device_resources const& handle,
-  raft::host_matrix_view<const uint8_t, int64_t, row_major> dataset,
-  raft::host_matrix_view<const uint8_t, int64_t, row_major> queries,
-  raft::host_matrix_view<const int64_t, int64_t, row_major> neighbor_candidates,
-  raft::host_matrix_view<int64_t, int64_t, row_major> indices,
-  raft::host_matrix_view<float, int64_t, row_major> distances,
-  distance::DistanceType metric);
-
-}  // namespace raft::neighbors
diff --git a/cpp/src/raft_runtime/neighbors/refine_d_int64_t_float.cu b/cpp/src/raft_runtime/neighbors/refine_d_int64_t_float.cu
index 8ad8f9e8f1..79cec55294 100644
--- a/cpp/src/raft_runtime/neighbors/refine_d_int64_t_float.cu
+++ b/cpp/src/raft_runtime/neighbors/refine_d_int64_t_float.cu
@@ -15,7 +15,6 @@
  */
 
 #include <raft/neighbors/refine.cuh>
-#include <raft/neighbors/specializations.cuh>
 
 namespace raft::runtime::neighbors {
 

From 06e1b6fdf445be9d368e4dff2bddd847c2159910 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 13 Apr 2023 17:34:37 +0200
Subject: [PATCH 32/89] Remove unused fused l2 knn files

---
 .../raft/neighbors/specializations.cuh        |  1 -
 .../specializations/fused_l2_knn.cuh          | 80 -------------------
 .../fused_l2_knn_int_float_false.cu           | 42 ----------
 .../fused_l2_knn_int_float_true.cu            | 41 ----------
 .../fused_l2_knn_long_float_false.cu          | 41 ----------
 .../fused_l2_knn_long_float_true.cu           | 41 ----------
 cpp/src/spatial/knn/detail/fused_l2_knn.cu    | 46 -----------
 7 files changed, 292 deletions(-)
 delete mode 100644 cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh
 delete mode 100644 cpp/src/neighbors/specializations/fused_l2_knn_int_float_false.cu
 delete mode 100644 cpp/src/neighbors/specializations/fused_l2_knn_int_float_true.cu
 delete mode 100644 cpp/src/neighbors/specializations/fused_l2_knn_long_float_false.cu
 delete mode 100644 cpp/src/neighbors/specializations/fused_l2_knn_long_float_true.cu
 delete mode 100644 cpp/src/spatial/knn/detail/fused_l2_knn.cu

diff --git a/cpp/include/raft/neighbors/specializations.cuh b/cpp/include/raft/neighbors/specializations.cuh
index 7d3aa0fd97..fa91a815e9 100644
--- a/cpp/include/raft/neighbors/specializations.cuh
+++ b/cpp/include/raft/neighbors/specializations.cuh
@@ -18,7 +18,6 @@
 
 #include <raft/neighbors/specializations/ball_cover.cuh>
 #include <raft/neighbors/specializations/brute_force.cuh>
-#include <raft/neighbors/specializations/fused_l2_knn.cuh>
 
 #include <raft/cluster/specializations.cuh>
 #include <raft/distance/specializations.cuh>
diff --git a/cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh b/cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh
deleted file mode 100644
index 916db8f0a2..0000000000
--- a/cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/spatial/knn/detail/fused_l2_knn.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
-
-extern template void fusedL2Knn<long, float, true>(size_t D,
-                                                   long* out_inds,
-                                                   float* out_dists,
-                                                   const float* index,
-                                                   const float* query,
-                                                   size_t n_index_rows,
-                                                   size_t n_query_rows,
-                                                   int k,
-                                                   bool rowMajorIndex,
-                                                   bool rowMajorQuery,
-                                                   cudaStream_t stream,
-                                                   raft::distance::DistanceType metric);
-
-extern template void fusedL2Knn<long, float, false>(size_t D,
-                                                    long* out_inds,
-                                                    float* out_dists,
-                                                    const float* index,
-                                                    const float* query,
-                                                    size_t n_index_rows,
-                                                    size_t n_query_rows,
-                                                    int k,
-                                                    bool rowMajorIndex,
-                                                    bool rowMajorQuery,
-                                                    cudaStream_t stream,
-                                                    raft::distance::DistanceType metric);
-
-extern template void fusedL2Knn<int, float, true>(size_t D,
-                                                  int* out_inds,
-                                                  float* out_dists,
-                                                  const float* index,
-                                                  const float* query,
-                                                  size_t n_index_rows,
-                                                  size_t n_query_rows,
-                                                  int k,
-                                                  bool rowMajorIndex,
-                                                  bool rowMajorQuery,
-                                                  cudaStream_t stream,
-                                                  raft::distance::DistanceType metric);
-
-extern template void fusedL2Knn<int, float, false>(size_t D,
-                                                   int* out_inds,
-                                                   float* out_dists,
-                                                   const float* index,
-                                                   const float* query,
-                                                   size_t n_index_rows,
-                                                   size_t n_query_rows,
-                                                   int k,
-                                                   bool rowMajorIndex,
-                                                   bool rowMajorQuery,
-                                                   cudaStream_t stream,
-                                                   raft::distance::DistanceType metric);
-
-};  // namespace detail
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
diff --git a/cpp/src/neighbors/specializations/fused_l2_knn_int_float_false.cu b/cpp/src/neighbors/specializations/fused_l2_knn_int_float_false.cu
deleted file mode 100644
index 72fdac9526..0000000000
--- a/cpp/src/neighbors/specializations/fused_l2_knn_int_float_false.cu
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/neighbors/specializations.cuh>
-#include <raft/spatial/knn/detail/fused_l2_knn.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
-
-template void fusedL2Knn<int, float, false>(size_t D,
-                                            int* out_inds,
-                                            float* out_dists,
-                                            const float* index,
-                                            const float* query,
-                                            size_t n_index_rows,
-                                            size_t n_query_rows,
-                                            int k,
-                                            bool rowMajorIndex,
-                                            bool rowMajorQuery,
-                                            cudaStream_t stream,
-                                            raft::distance::DistanceType metric);
-
-};  // namespace detail
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
diff --git a/cpp/src/neighbors/specializations/fused_l2_knn_int_float_true.cu b/cpp/src/neighbors/specializations/fused_l2_knn_int_float_true.cu
deleted file mode 100644
index c7616462fe..0000000000
--- a/cpp/src/neighbors/specializations/fused_l2_knn_int_float_true.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/neighbors/specializations.cuh>
-#include <raft/spatial/knn/detail/fused_l2_knn.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
-template void fusedL2Knn<int, float, true>(size_t D,
-                                           int* out_inds,
-                                           float* out_dists,
-                                           const float* index,
-                                           const float* query,
-                                           size_t n_index_rows,
-                                           size_t n_query_rows,
-                                           int k,
-                                           bool rowMajorIndex,
-                                           bool rowMajorQuery,
-                                           cudaStream_t stream,
-                                           raft::distance::DistanceType metric);
-
-};  // namespace detail
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
diff --git a/cpp/src/neighbors/specializations/fused_l2_knn_long_float_false.cu b/cpp/src/neighbors/specializations/fused_l2_knn_long_float_false.cu
deleted file mode 100644
index 16bf058238..0000000000
--- a/cpp/src/neighbors/specializations/fused_l2_knn_long_float_false.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/neighbors/specializations.cuh>
-#include <raft/spatial/knn/detail/fused_l2_knn.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
-
-template void fusedL2Knn<long, float, false>(size_t D,
-                                             long* out_inds,
-                                             float* out_dists,
-                                             const float* index,
-                                             const float* query,
-                                             size_t n_index_rows,
-                                             size_t n_query_rows,
-                                             int k,
-                                             bool rowMajorIndex,
-                                             bool rowMajorQuery,
-                                             cudaStream_t stream,
-                                             raft::distance::DistanceType metric);
-};  // namespace detail
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
diff --git a/cpp/src/neighbors/specializations/fused_l2_knn_long_float_true.cu b/cpp/src/neighbors/specializations/fused_l2_knn_long_float_true.cu
deleted file mode 100644
index 06cf55eae3..0000000000
--- a/cpp/src/neighbors/specializations/fused_l2_knn_long_float_true.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/neighbors/specializations.cuh>
-#include <raft/spatial/knn/detail/fused_l2_knn.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-namespace detail {
-
-template void fusedL2Knn<long, float, true>(size_t D,
-                                            long* out_inds,
-                                            float* out_dists,
-                                            const float* index,
-                                            const float* query,
-                                            size_t n_index_rows,
-                                            size_t n_query_rows,
-                                            int k,
-                                            bool rowMajorIndex,
-                                            bool rowMajorQuery,
-                                            cudaStream_t stream,
-                                            raft::distance::DistanceType metric);
-};  // namespace detail
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
diff --git a/cpp/src/spatial/knn/detail/fused_l2_knn.cu b/cpp/src/spatial/knn/detail/fused_l2_knn.cu
deleted file mode 100644
index 8795e265e9..0000000000
--- a/cpp/src/spatial/knn/detail/fused_l2_knn.cu
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstddef>                           // size_t
-#include <cstdint>                           // int_Xt
-#include <raft/distance/distance_types.hpp>  // DistanceType
-#include <raft/spatial/knn/detail/fused_l2_knn-inl.cuh>
-
-#define instantiate_raft_spatial_knn_detail_fusedL2Knn(Mvalue_idx, Mvalue_t, MusePrevTopKs)  \
-  template void raft::spatial::knn::detail::fusedL2Knn<Mvalue_idx, Mvalue_t, MusePrevTopKs>( \
-    size_t D,                                                                                \
-    Mvalue_idx * out_inds,                                                                   \
-    Mvalue_t * out_dists,                                                                    \
-    const Mvalue_t* index,                                                                   \
-    const Mvalue_t* query,                                                                   \
-    size_t n_index_rows,                                                                     \
-    size_t n_query_rows,                                                                     \
-    int k,                                                                                   \
-    bool rowMajorIndex,                                                                      \
-    bool rowMajorQuery,                                                                      \
-    cudaStream_t stream,                                                                     \
-    raft::distance::DistanceType metric)
-
-instantiate_raft_spatial_knn_detail_fusedL2Knn(long, float, true);
-instantiate_raft_spatial_knn_detail_fusedL2Knn(long, float, false);
-instantiate_raft_spatial_knn_detail_fusedL2Knn(int, float, true);
-instantiate_raft_spatial_knn_detail_fusedL2Knn(int, float, false);
-
-// These are used by brute_force_knn:
-instantiate_raft_spatial_knn_detail_fusedL2Knn(unsigned int, float, true);
-instantiate_raft_spatial_knn_detail_fusedL2Knn(unsigned int, float, false);
-
-#undef instantiate_raft_spatial_knn_detail_fusedL2Knn

From 4a869e6fbc63d86c1ce19baf90336cccc699ccc1 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 13 Apr 2023 18:17:27 +0200
Subject: [PATCH 33/89] Split ball cover

---
 cpp/CMakeLists.txt                            |   4 +-
 cpp/include/raft/neighbors/ball_cover-ext.cuh | 337 ++++++++++++++++++
 cpp/include/raft/neighbors/ball_cover-inl.cuh |   5 -
 cpp/include/raft/neighbors/ball_cover.cuh     |  24 ++
 .../raft/neighbors/specializations.cuh        |   1 -
 .../neighbors/specializations/ball_cover.cuh  |  52 ---
 cpp/src/neighbors/ball_cover.cu               |  66 ++++
 .../ball_cover_all_knn_query.cu               |  33 --
 .../specializations/ball_cover_build_index.cu |  31 --
 .../specializations/ball_cover_knn_query.cu   |  34 --
 10 files changed, 428 insertions(+), 159 deletions(-)
 create mode 100644 cpp/include/raft/neighbors/ball_cover-ext.cuh
 delete mode 100644 cpp/include/raft/neighbors/specializations/ball_cover.cuh
 create mode 100644 cpp/src/neighbors/ball_cover.cu
 delete mode 100644 cpp/src/neighbors/specializations/ball_cover_all_knn_query.cu
 delete mode 100644 cpp/src/neighbors/specializations/ball_cover_build_index.cu
 delete mode 100644 cpp/src/neighbors/specializations/ball_cover_knn_query.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index eab6cffe89..e6ffe0d383 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -344,13 +344,11 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/specializations/detail/brute_force_knn_impl_long_float_uint.cu
     src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_int.cu
     src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_uint.cu
+    src/neighbors/ball_cover.cu
     src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu
     src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu
     src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu
     src/neighbors/detail/ivf_flat_search.cu
-    src/neighbors/specializations/ball_cover_all_knn_query.cu
-    src/neighbors/specializations/ball_cover_build_index.cu
-    src/neighbors/specializations/ball_cover_knn_query.cu
     src/neighbors/ivfpq_build_float_int64_t.cu
     src/neighbors/ivfpq_build_int8_t_int64_t.cu
     src/neighbors/ivfpq_build_uint8_t_int64_t.cu
diff --git a/cpp/include/raft/neighbors/ball_cover-ext.cuh b/cpp/include/raft/neighbors/ball_cover-ext.cuh
new file mode 100644
index 0000000000..89ea855d31
--- /dev/null
+++ b/cpp/include/raft/neighbors/ball_cover-ext.cuh
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstdint>
+
+#include <raft/distance/distance_types.hpp>
+#include <raft/neighbors/ball_cover_types.hpp>
+#include <raft/spatial/knn/detail/ball_cover.cuh>
+#include <raft/spatial/knn/detail/ball_cover/common.cuh>
+#include <raft/util/raft_explicit.hpp>
+
+#include <thrust/transform.h>
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE
+
+namespace raft::neighbors::ball_cover {
+
+/**
+ * @defgroup random_ball_cover Random Ball Cover algorithm
+ * @{
+ */
+
+/**
+ * Builds and populates a previously unbuilt BallCoverIndex
+ *
+ * Usage example:
+ * @code{.cpp}
+ *
+ *  #include <raft/core/device_resources.hpp>
+ *  #include <raft/neighbors/ball_cover.cuh>
+ *  #include <raft/distance/distance_types.hpp>
+ *  using namespace raft::neighbors;
+ *
+ *  raft::device_resources handle;
+ *  ...
+ *  auto metric = raft::distance::DistanceType::L2Expanded;
+ *  BallCoverIndex index(handle, X, metric);
+ *
+ *  ball_cover::build_index(handle, index);
+ * @endcode
+ *
+ * @tparam idx_t knn index type
+ * @tparam value_t knn value type
+ * @tparam int_t integral type for knn params
+ * @tparam matrix_idx_t matrix indexing type
+ * @param[in] handle library resource management handle
+ * @param[inout] index an empty (and not previously built) instance of BallCoverIndex
+ */
+template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
+void build_index(raft::device_resources const& handle,
+                 BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index) RAFT_EXPLICIT;
+
+/** @} */  // end group random_ball_cover
+
+/**
+ * Performs a faster exact knn in metric spaces using the triangle
+ * inequality with a number of landmark points to reduce the
+ * number of distance computations from O(n^2) to O(sqrt(n)). This
+ * performs an all neighbors knn, which can reuse memory when
+ * the index and query are the same array. This function will
+ * build the index and assumes rbc_build_index() has not already
+ * been called.
+ * @tparam idx_t knn index type
+ * @tparam value_t knn distance type
+ * @tparam int_t type for integers, such as number of rows/cols
+ * @param[in] handle raft handle for resource management
+ * @param[inout] index ball cover index which has not yet been built
+ * @param[in] k number of nearest neighbors to find
+ * @param[in] perform_post_filtering if this is false, only the closest k landmarks
+ *                               are considered (which will return approximate
+ *                               results).
+ * @param[out] inds output knn indices
+ * @param[out] dists output knn distances
+ * @param[in] weight a weight for overlap between the closest landmark and
+ *               the radius of other landmarks when pruning distances.
+ *               Setting this value below 1 can effectively turn off
+ *               computing distances against many other balls, enabling
+ *               approximate nearest neighbors. Recall can be adjusted
+ *               based on how many relevant balls are ignored. Note that
+ *               many datasets can still have great recall even by only
+ *               looking in the closest landmark.
+ */
+template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
+void all_knn_query(raft::device_resources const& handle,
+                   BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
+                   int_t k,
+                   idx_t* inds,
+                   value_t* dists,
+                   bool perform_post_filtering = true,
+                   float weight                = 1.0) RAFT_EXPLICIT;
+
+/**
+ * @ingroup random_ball_cover
+ * @{
+ */
+
+/**
+ * Performs a faster exact knn in metric spaces using the triangle
+ * inequality with a number of landmark points to reduce the
+ * number of distance computations from O(n^2) to O(sqrt(n)). This
+ * performs an all neighbors knn, which can reuse memory when
+ * the index and query are the same array. This function will
+ * build the index and assumes rbc_build_index() has not already
+ * been called.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *
+ *  #include <raft/core/device_resources.hpp>
+ *  #include <raft/neighbors/ball_cover.cuh>
+ *  #include <raft/distance/distance_types.hpp>
+ *  using namespace raft::neighbors;
+ *
+ *  raft::device_resources handle;
+ *  ...
+ *  auto metric = raft::distance::DistanceType::L2Expanded;
+ *
+ *  // Construct a ball cover index
+ *  BallCoverIndex index(handle, X, metric);
+ *
+ *  // Perform all neighbors knn query
+ *  ball_cover::all_knn_query(handle, index, inds, dists, k);
+ * @endcode
+ *
+ * @tparam idx_t knn index type
+ * @tparam value_t knn distance type
+ * @tparam int_t type for integers, such as number of rows/cols
+ * @tparam matrix_idx_t matrix indexing type
+ *
+ * @param[in] handle raft handle for resource management
+ * @param[inout] index ball cover index which has not yet been built
+ * @param[out] inds output knn indices
+ * @param[out] dists output knn distances
+ * @param[in] k number of nearest neighbors to find
+ * @param[in] perform_post_filtering if this is false, only the closest k landmarks
+ *                               are considered (which will return approximate
+ *                               results).
+ * @param[in] weight a weight for overlap between the closest landmark and
+ *               the radius of other landmarks when pruning distances.
+ *               Setting this value below 1 can effectively turn off
+ *               computing distances against many other balls, enabling
+ *               approximate nearest neighbors. Recall can be adjusted
+ *               based on how many relevant balls are ignored. Note that
+ *               many datasets can still have great recall even by only
+ *               looking in the closest landmark.
+ */
+template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
+void all_knn_query(raft::device_resources const& handle,
+                   BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
+                   raft::device_matrix_view<idx_t, matrix_idx_t, row_major> inds,
+                   raft::device_matrix_view<value_t, matrix_idx_t, row_major> dists,
+                   int_t k,
+                   bool perform_post_filtering = true,
+                   float weight                = 1.0) RAFT_EXPLICIT;
+
+/** @} */
+
+/**
+ * Performs a faster exact knn in metric spaces using the triangle
+ * inequality with a number of landmark points to reduce the
+ * number of distance computations from O(n^2) to O(sqrt(n)). This
+ * function does not build the index and assumes rbc_build_index() has
+ * already been called. Use this function when the index and
+ * query arrays are different, otherwise use rbc_all_knn_query().
+ * @tparam idx_t index type
+ * @tparam value_t distances type
+ * @tparam int_t integer type for size info
+ * @param[in] handle raft handle for resource management
+ * @param[in] index ball cover index which has already been built
+ * @param[in] k number of nearest neighbors to find
+ * @param[in] query pointer to the query data points
+ * @param[in] perform_post_filtering if this is false, only the closest k landmarks
+ *                               are considered (which will return approximate
+ *                               results).
+ * @param[out] inds output knn indices
+ * @param[out] dists output knn distances
+ * @param[in] weight a weight for overlap between the closest landmark and
+ *               the radius of other landmarks when pruning distances.
+ *               Setting this value below 1 can effectively turn off
+ *               computing distances against many other balls, enabling
+ *               approximate nearest neighbors. Recall can be adjusted
+ *               based on how many relevant balls are ignored. Note that
+ *               many datasets can still have great recall even by only
+ *               looking in the closest landmark.
+ * @param[in] n_query_pts number of query points
+ */
+template <typename idx_t, typename value_t, typename int_t>
+void knn_query(raft::device_resources const& handle,
+               const BallCoverIndex<idx_t, value_t, int_t>& index,
+               int_t k,
+               const value_t* query,
+               int_t n_query_pts,
+               idx_t* inds,
+               value_t* dists,
+               bool perform_post_filtering = true,
+               float weight                = 1.0) RAFT_EXPLICIT;
+/**
+ * @ingroup random_ball_cover
+ * @{
+ */
+
+/**
+ * Performs a faster exact knn in metric spaces using the triangle
+ * inequality with a number of landmark points to reduce the
+ * number of distance computations from O(n^2) to O(sqrt(n)). This
+ * function does not build the index and assumes rbc_build_index() has
+ * already been called. Use this function when the index and
+ * query arrays are different, otherwise use rbc_all_knn_query().
+ *
+ * Usage example:
+ * @code{.cpp}
+ *
+ *  #include <raft/core/device_resources.hpp>
+ *  #include <raft/neighbors/ball_cover.cuh>
+ *  #include <raft/distance/distance_types.hpp>
+ *  using namespace raft::neighbors;
+ *
+ *  raft::device_resources handle;
+ *  ...
+ *  auto metric = raft::distance::DistanceType::L2Expanded;
+ *
+ *  // Build a ball cover index
+ *  BallCoverIndex index(handle, X, metric);
+ *  ball_cover::build_index(handle, index);
+ *
+ *  // Perform knn query against a separate set of query points
+ *  ball_cover::knn_query(handle, index, query, inds, dists, k);
+ * @endcode
+
+ *
+ * @tparam idx_t index type
+ * @tparam value_t distances type
+ * @tparam int_t integer type for size info
+ * @tparam matrix_idx_t matrix indexing type
+ * @param[in] handle raft handle for resource management
+ * @param[in] index ball cover index which has already been built
+ * @param[in] query device matrix containing query data points
+ * @param[out] inds output knn indices
+ * @param[out] dists output knn distances
+ * @param[in] k number of nearest neighbors to find
+ * @param[in] perform_post_filtering if this is false, only the closest k landmarks
+ *                               are considered (which will return approximate
+ *                               results).
+ * @param[in] weight a weight for overlap between the closest landmark and
+ *               the radius of other landmarks when pruning distances.
+ *               Setting this value below 1 can effectively turn off
+ *               computing distances against many other balls, enabling
+ *               approximate nearest neighbors. Recall can be adjusted
+ *               based on how many relevant balls are ignored. Note that
+ *               many datasets can still have great recall even by only
+ *               looking in the closest landmark.
+ */
+template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
+void knn_query(raft::device_resources const& handle,
+               const BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
+               raft::device_matrix_view<const value_t, matrix_idx_t, row_major> query,
+               raft::device_matrix_view<idx_t, matrix_idx_t, row_major> inds,
+               raft::device_matrix_view<value_t, matrix_idx_t, row_major> dists,
+               int_t k,
+               bool perform_post_filtering = true,
+               float weight                = 1.0) RAFT_EXPLICIT;
+
+/** @} */
+
+// TODO: implement functions for:
+//  4. rbc_eps_neigh() - given a populated index, perform query against different query array
+//  5. rbc_all_eps_neigh() - populate a BallCoverIndex and query against training data
+
+}  // namespace raft::neighbors::ball_cover
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+
+#define instantiate_raft_neighbors_ball_cover(idx_t, value_t, int_t, matrix_idx_t)                 \
+  extern template void                                                                             \
+  raft::neighbors::ball_cover::build_index<idx_t, value_t, int_t, matrix_idx_t>(                   \
+    raft::device_resources const& handle,                                                          \
+    raft::neighbors::ball_cover::BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index);      \
+                                                                                                   \
+  extern template void                                                                             \
+  raft::neighbors::ball_cover::all_knn_query<idx_t, value_t, int_t, matrix_idx_t>(                 \
+    raft::device_resources const& handle,                                                          \
+    raft::neighbors::ball_cover::BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,       \
+    int_t k,                                                                                       \
+    idx_t* inds,                                                                                   \
+    value_t* dists,                                                                                \
+    bool perform_post_filtering,                                                                   \
+    float weight);                                                                                 \
+                                                                                                   \
+  extern template void                                                                             \
+  raft::neighbors::ball_cover::all_knn_query<idx_t, value_t, int_t, matrix_idx_t>(                 \
+    raft::device_resources const& handle,                                                          \
+    raft::neighbors::ball_cover::BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,       \
+    raft::device_matrix_view<idx_t, matrix_idx_t, row_major> inds,                                 \
+    raft::device_matrix_view<value_t, matrix_idx_t, row_major> dists,                              \
+    int_t k,                                                                                       \
+    bool perform_post_filtering,                                                                   \
+    float weight);                                                                                 \
+                                                                                                   \
+  extern template void raft::neighbors::ball_cover::knn_query<idx_t, value_t, int_t>(              \
+    raft::device_resources const& handle,                                                          \
+    const raft::neighbors::ball_cover::BallCoverIndex<idx_t, value_t, int_t>& index,               \
+    int_t k,                                                                                       \
+    const value_t* query,                                                                          \
+    int_t n_query_pts,                                                                             \
+    idx_t* inds,                                                                                   \
+    value_t* dists,                                                                                \
+    bool perform_post_filtering,                                                                   \
+    float weight);                                                                                 \
+                                                                                                   \
+  extern template void                                                                             \
+  raft::neighbors::ball_cover::knn_query<idx_t, value_t, int_t, matrix_idx_t>(                     \
+    raft::device_resources const& handle,                                                          \
+    const raft::neighbors::ball_cover::BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index, \
+    raft::device_matrix_view<const value_t, matrix_idx_t, row_major> query,                        \
+    raft::device_matrix_view<idx_t, matrix_idx_t, row_major> inds,                                 \
+    raft::device_matrix_view<value_t, matrix_idx_t, row_major> dists,                              \
+    int_t k,                                                                                       \
+    bool perform_post_filtering,                                                                   \
+    float weight);
+
+instantiate_raft_neighbors_ball_cover(int64_t, float, uint32_t, uint32_t);
+
+#undef instantiate_raft_neighbors_ball_cover
diff --git a/cpp/include/raft/neighbors/ball_cover-inl.cuh b/cpp/include/raft/neighbors/ball_cover-inl.cuh
index 619c57a35a..cc79bfd323 100644
--- a/cpp/include/raft/neighbors/ball_cover-inl.cuh
+++ b/cpp/include/raft/neighbors/ball_cover-inl.cuh
@@ -13,9 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef __BALL_COVER_H
-#define __BALL_COVER_H
-
 #pragma once
 
 #include <cstdint>
@@ -391,5 +388,3 @@ void knn_query(raft::device_resources const& handle,
 //  5. rbc_all_eps_neigh() - populate a BallCoverIndex and query against training data
 
 }  // namespace raft::neighbors::ball_cover
-
-#endif
diff --git a/cpp/include/raft/neighbors/ball_cover.cuh b/cpp/include/raft/neighbors/ball_cover.cuh
index e69de29bb2..de01e756a0 100644
--- a/cpp/include/raft/neighbors/ball_cover.cuh
+++ b/cpp/include/raft/neighbors/ball_cover.cuh
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "ball_cover-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "ball_cover-ext.cuh"
+#endif
diff --git a/cpp/include/raft/neighbors/specializations.cuh b/cpp/include/raft/neighbors/specializations.cuh
index fa91a815e9..1fe646bfa5 100644
--- a/cpp/include/raft/neighbors/specializations.cuh
+++ b/cpp/include/raft/neighbors/specializations.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <raft/neighbors/specializations/ball_cover.cuh>
 #include <raft/neighbors/specializations/brute_force.cuh>
 
 #include <raft/cluster/specializations.cuh>
diff --git a/cpp/include/raft/neighbors/specializations/ball_cover.cuh b/cpp/include/raft/neighbors/specializations/ball_cover.cuh
deleted file mode 100644
index 33e1a272e3..0000000000
--- a/cpp/include/raft/neighbors/specializations/ball_cover.cuh
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/neighbors/ball_cover.cuh>
-#include <raft/neighbors/ball_cover_types.hpp>
-
-#include <cstdint>
-
-namespace raft::neighbors::ball_cover {
-extern template class BallCoverIndex<int, float, std::uint32_t, std::uint32_t>;
-extern template class BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>;
-
-extern template void build_index<std::int64_t, float, std::uint32_t, std::uint32_t>(
-  raft::device_resources const& handle,
-  BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index);
-
-extern template void knn_query<std::int64_t, float, std::uint32_t>(
-  raft::device_resources const& handle,
-  const BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
-  std::uint32_t k,
-  const float* query,
-  std::uint32_t n_query_pts,
-  std::int64_t* inds,
-  float* dists,
-  bool perform_post_filtering,
-  float weight);
-
-extern template void all_knn_query<std::int64_t, float, std::uint32_t, std::uint32_t>(
-  raft::device_resources const& handle,
-  BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
-  std::uint32_t k,
-  std::int64_t* inds,
-  float* dists,
-  bool perform_post_filtering,
-  float weight);
-
-};  // namespace raft::neighbors::ball_cover
diff --git a/cpp/src/neighbors/ball_cover.cu b/cpp/src/neighbors/ball_cover.cu
new file mode 100644
index 0000000000..4c49c1847b
--- /dev/null
+++ b/cpp/src/neighbors/ball_cover.cu
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/neighbors/ball_cover-inl.cuh>
+
+#define instantiate_raft_neighbors_ball_cover(idx_t, value_t, int_t, matrix_idx_t)                 \
+  template void raft::neighbors::ball_cover::build_index<idx_t, value_t, int_t, matrix_idx_t>(     \
+    raft::device_resources const& handle,                                                          \
+    raft::neighbors::ball_cover::BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index);      \
+                                                                                                   \
+  template void raft::neighbors::ball_cover::all_knn_query<idx_t, value_t, int_t, matrix_idx_t>(   \
+    raft::device_resources const& handle,                                                          \
+    raft::neighbors::ball_cover::BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,       \
+    int_t k,                                                                                       \
+    idx_t* inds,                                                                                   \
+    value_t* dists,                                                                                \
+    bool perform_post_filtering,                                                                   \
+    float weight);                                                                                 \
+                                                                                                   \
+  template void raft::neighbors::ball_cover::all_knn_query<idx_t, value_t, int_t, matrix_idx_t>(   \
+    raft::device_resources const& handle,                                                          \
+    raft::neighbors::ball_cover::BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,       \
+    raft::device_matrix_view<idx_t, matrix_idx_t, row_major> inds,                                 \
+    raft::device_matrix_view<value_t, matrix_idx_t, row_major> dists,                              \
+    int_t k,                                                                                       \
+    bool perform_post_filtering,                                                                   \
+    float weight);                                                                                 \
+                                                                                                   \
+  template void raft::neighbors::ball_cover::knn_query<idx_t, value_t, int_t>(                     \
+    raft::device_resources const& handle,                                                          \
+    const raft::neighbors::ball_cover::BallCoverIndex<idx_t, value_t, int_t>& index,               \
+    int_t k,                                                                                       \
+    const value_t* query,                                                                          \
+    int_t n_query_pts,                                                                             \
+    idx_t* inds,                                                                                   \
+    value_t* dists,                                                                                \
+    bool perform_post_filtering,                                                                   \
+    float weight);                                                                                 \
+                                                                                                   \
+  template void raft::neighbors::ball_cover::knn_query<idx_t, value_t, int_t, matrix_idx_t>(       \
+    raft::device_resources const& handle,                                                          \
+    const raft::neighbors::ball_cover::BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index, \
+    raft::device_matrix_view<const value_t, matrix_idx_t, row_major> query,                        \
+    raft::device_matrix_view<idx_t, matrix_idx_t, row_major> inds,                                 \
+    raft::device_matrix_view<value_t, matrix_idx_t, row_major> dists,                              \
+    int_t k,                                                                                       \
+    bool perform_post_filtering,                                                                   \
+    float weight);
+
+instantiate_raft_neighbors_ball_cover(int64_t, float, uint32_t, uint32_t);
+
+#undef instantiate_raft_neighbors_ball_cover
diff --git a/cpp/src/neighbors/specializations/ball_cover_all_knn_query.cu b/cpp/src/neighbors/specializations/ball_cover_all_knn_query.cu
deleted file mode 100644
index 305dd6796e..0000000000
--- a/cpp/src/neighbors/specializations/ball_cover_all_knn_query.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/ball_cover.cuh>
-#include <raft/neighbors/ball_cover_types.hpp>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cstdint>
-
-namespace raft::neighbors::ball_cover {
-template void all_knn_query<std::int64_t, float, std::uint32_t>(
-  raft::device_resources const& handle,
-  BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
-  std::uint32_t k,
-  std::int64_t* inds,
-  float* dists,
-  bool perform_post_filtering,
-  float weight);
-
-};  // namespace raft::neighbors::ball_cover
diff --git a/cpp/src/neighbors/specializations/ball_cover_build_index.cu b/cpp/src/neighbors/specializations/ball_cover_build_index.cu
deleted file mode 100644
index ec7f4bcf52..0000000000
--- a/cpp/src/neighbors/specializations/ball_cover_build_index.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/ball_cover.cuh>
-#include <raft/neighbors/ball_cover_types.hpp>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cstdint>
-
-namespace raft::neighbors::ball_cover {
-template class BallCoverIndex<int, float, std::uint32_t, std::uint32_t>;
-template class BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>;
-
-template void build_index<std::int64_t, float, std::uint32_t, std::uint32_t>(
-  raft::device_resources const& handle,
-  BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index);
-
-};  // namespace raft::neighbors::ball_cover
diff --git a/cpp/src/neighbors/specializations/ball_cover_knn_query.cu b/cpp/src/neighbors/specializations/ball_cover_knn_query.cu
deleted file mode 100644
index 634427200e..0000000000
--- a/cpp/src/neighbors/specializations/ball_cover_knn_query.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/neighbors/ball_cover.cuh>
-#include <raft/neighbors/ball_cover_types.hpp>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ball_cover {
-template void knn_query<std::int64_t, float, std::uint32_t>(
-  raft::device_resources const& handle,
-  const BallCoverIndex<std::int64_t, float, std::uint32_t, std::uint32_t>& index,
-  std::uint32_t k,
-  const float* query,
-  std::uint32_t n_query_pts,
-  std::int64_t* inds,
-  float* dists,
-  bool perform_post_filtering,
-  float weight);
-
-};  // namespace raft::neighbors::ball_cover

From 89ac8066721c199b2c5934e9ed3150b34afab031 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 13 Apr 2023 18:18:00 +0200
Subject: [PATCH 34/89] Remove unused files

---
 .../compute_similarity_float_float_fast.cu    | 26 -----------------
 ...pute_similarity_float_float_no_basediff.cu | 27 ------------------
 ...pute_similarity_float_float_no_smem_lut.cu | 27 ------------------
 .../compute_similarity_float_fp8s_fast.cu     | 27 ------------------
 ...mpute_similarity_float_fp8s_no_basediff.cu | 28 -------------------
 ...mpute_similarity_float_fp8s_no_smem_lut.cu | 28 -------------------
 .../compute_similarity_float_fp8u_fast.cu     | 28 -------------------
 ...mpute_similarity_float_fp8u_no_basediff.cu | 28 -------------------
 ...mpute_similarity_float_fp8u_no_smem_lut.cu | 28 -------------------
 .../compute_similarity_float_half_fast.cu     | 27 ------------------
 ...mpute_similarity_float_half_no_basediff.cu | 27 ------------------
 ...mpute_similarity_float_half_no_smem_lut.cu | 27 ------------------
 .../compute_similarity_half_fp8s_fast.cu      | 27 ------------------
 ...ompute_similarity_half_fp8s_no_basediff.cu | 27 ------------------
 ...ompute_similarity_half_fp8s_no_smem_lut.cu | 27 ------------------
 .../compute_similarity_half_fp8u_fast.cu      | 27 ------------------
 ...ompute_similarity_half_fp8u_no_basediff.cu | 28 -------------------
 ...ompute_similarity_half_fp8u_no_smem_lut.cu | 28 -------------------
 .../compute_similarity_half_half_fast.cu      | 27 ------------------
 ...ompute_similarity_half_half_no_basediff.cu | 27 ------------------
 ...ompute_similarity_half_half_no_smem_lut.cu | 27 ------------------
 ...mpute_similarity_float_half_no_smem_lut.cu | 27 ------------------
 22 files changed, 600 deletions(-)
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_float_float_fast.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_float_half_fast.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/ivfpq_compute_similarity_float_half_no_smem_lut.cu

diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_float_float_fast.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_float_fast.cu
deleted file mode 100644
index 33c4e7ffc0..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_float_float_fast.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cuda_fp16.h>
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<float, float, true, true>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<float, float>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu
deleted file mode 100644
index f543369de5..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_float_float_no_basediff.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<float, float, false, true>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<float, float>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu
deleted file mode 100644
index 1a0322a722..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_float_float_no_smem_lut.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<float, float, true, false>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<float, float>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu
deleted file mode 100644
index c7b5c9ffe9..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_fast.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<float, fp_8bit<5, true>, true, true>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<float, fp_8bit<5, true>>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu
deleted file mode 100644
index efb2a477a7..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_basediff.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<float, fp_8bit<5, true>, false, true>(uint32_t,
-                                                                                  uint32_t)
-  -> compute_similarity_kernel_t<float, fp_8bit<5, true>>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu
deleted file mode 100644
index b9051eb011..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8s_no_smem_lut.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<float, fp_8bit<5, true>, true, false>(uint32_t,
-                                                                                  uint32_t)
-  -> compute_similarity_kernel_t<float, fp_8bit<5, true>>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu
deleted file mode 100644
index c6b1bad123..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_fast.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<float, fp_8bit<5, false>, true, true>(uint32_t,
-                                                                                  uint32_t)
-  -> compute_similarity_kernel_t<float, fp_8bit<5, false>>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu
deleted file mode 100644
index d6033345da..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_basediff.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<float, fp_8bit<5, false>, false, true>(uint32_t,
-                                                                                   uint32_t)
-  -> compute_similarity_kernel_t<float, fp_8bit<5, false>>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu
deleted file mode 100644
index 1add18cb4a..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_float_fp8u_no_smem_lut.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<float, fp_8bit<5, false>, true, false>(uint32_t,
-                                                                                   uint32_t)
-  -> compute_similarity_kernel_t<float, fp_8bit<5, false>>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_float_half_fast.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_half_fast.cu
deleted file mode 100644
index 6020d7035b..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_float_half_fast.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<float, half, true, true>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<float, half>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu
deleted file mode 100644
index 62be67e1a9..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_float_half_no_basediff.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<float, half, false, true>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<float, half>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu
deleted file mode 100644
index 145312f334..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_float_half_no_smem_lut.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<float, half, true, false>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<float, half>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu
deleted file mode 100644
index c9365e1bb4..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_fast.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<half, fp_8bit<5, true>, true, true>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<half, fp_8bit<5, true>>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu
deleted file mode 100644
index d5c6934da2..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_basediff.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<half, fp_8bit<5, true>, false, true>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<half, fp_8bit<5, true>>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu
deleted file mode 100644
index bac8c8706b..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8s_no_smem_lut.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<half, fp_8bit<5, true>, true, false>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<half, fp_8bit<5, true>>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu
deleted file mode 100644
index 2809005dd0..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_fast.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<half, fp_8bit<5, false>, true, true>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<half, fp_8bit<5, false>>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu
deleted file mode 100644
index 015ef21a15..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_basediff.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<half, fp_8bit<5, false>, false, true>(uint32_t,
-                                                                                  uint32_t)
-  -> compute_similarity_kernel_t<half, fp_8bit<5, false>>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu
deleted file mode 100644
index 0ac96c8440..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_half_fp8u_no_smem_lut.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<half, fp_8bit<5, false>, true, false>(uint32_t,
-                                                                                  uint32_t)
-  -> compute_similarity_kernel_t<half, fp_8bit<5, false>>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu
deleted file mode 100644
index f3501d11c0..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_half_half_fast.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<half, half, true, true>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<half, half>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu
deleted file mode 100644
index 7d10020480..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_half_half_no_basediff.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<half, half, false, true>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<half, half>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu
deleted file mode 100644
index 91ec2eca3e..0000000000
--- a/cpp/src/neighbors/specializations/detail/compute_similarity_half_half_no_smem_lut.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<half, half, true, false>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<half, half>;
-
-}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/src/neighbors/specializations/detail/ivfpq_compute_similarity_float_half_no_smem_lut.cu b/cpp/src/neighbors/specializations/detail/ivfpq_compute_similarity_float_half_no_smem_lut.cu
deleted file mode 100644
index 145312f334..0000000000
--- a/cpp/src/neighbors/specializations/detail/ivfpq_compute_similarity_float_half_no_smem_lut.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/detail/ivf_pq_search.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::neighbors::ivf_pq::detail {
-
-template auto get_compute_similarity_kernel<float, half, true, false>(uint32_t, uint32_t)
-  -> compute_similarity_kernel_t<float, half>;
-
-}  // namespace raft::neighbors::ivf_pq::detail

From bf6f20336829c81498b8a66cac6949aa322b4b24 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 13 Apr 2023 19:34:35 +0200
Subject: [PATCH 35/89] Split brute force knn

---
 cpp/CMakeLists.txt                            |   9 +-
 .../raft/neighbors/brute_force-ext.cuh        | 231 ++++++++++++++++++
 cpp/include/raft/neighbors/brute_force.cuh    |  25 ++
 .../raft/neighbors/specializations.cuh        |   4 -
 .../neighbors/specializations/brute_force.cuh |  46 ----
 cpp/src/neighbors/brute_force_00_generate.py  |  82 +++++++
 .../brute_force_fused_l2_knn_float_int64_t.cu |  51 ++++
 .../brute_force_knn_int64_t_float_int64_t.cu  |  49 ++++
 .../brute_force_knn_int64_t_float_uint32_t.cu |  49 ++++
 .../brute_force_knn_int_float_int.cu          |  49 ++++
 ...brute_force_knn_uint32_t_float_uint32_t.cu |  49 ++++
 .../brute_force_knn_impl_long_float_int.cu    |  39 ---
 .../brute_force_knn_impl_long_float_uint.cu   |  39 ---
 .../brute_force_knn_impl_uint_float_int.cu    |  39 ---
 .../brute_force_knn_impl_uint_float_uint.cu   |  39 ---
 15 files changed, 590 insertions(+), 210 deletions(-)
 create mode 100644 cpp/include/raft/neighbors/brute_force-ext.cuh
 delete mode 100644 cpp/include/raft/neighbors/specializations/brute_force.cuh
 create mode 100644 cpp/src/neighbors/brute_force_00_generate.py
 create mode 100644 cpp/src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu
 create mode 100644 cpp/src/neighbors/brute_force_knn_int64_t_float_int64_t.cu
 create mode 100644 cpp/src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
 create mode 100644 cpp/src/neighbors/brute_force_knn_int_float_int.cu
 create mode 100644 cpp/src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_int.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_uint.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_int.cu
 delete mode 100644 cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_uint.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e6ffe0d383..67e9ff7210 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -340,11 +340,12 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/ivfpq_search_float_int64_t.cu
     src/neighbors/ivfpq_search_int8_t_int64_t.cu
     src/neighbors/ivfpq_search_uint8_t_int64_t.cu
-    src/neighbors/specializations/detail/brute_force_knn_impl_long_float_int.cu
-    src/neighbors/specializations/detail/brute_force_knn_impl_long_float_uint.cu
-    src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_int.cu
-    src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_uint.cu
     src/neighbors/ball_cover.cu
+    src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
+    src/neighbors/brute_force_knn_int64_t_float_int64_t.cu
+    src/neighbors/brute_force_knn_int_float_int.cu
+    src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
+    src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu
     src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu
     src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu
     src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu
diff --git a/cpp/include/raft/neighbors/brute_force-ext.cuh b/cpp/include/raft/neighbors/brute_force-ext.cuh
new file mode 100644
index 0000000000..d3a70716e0
--- /dev/null
+++ b/cpp/include/raft/neighbors/brute_force-ext.cuh
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/operators.hpp>  // raft::identity_op
+#include <raft/distance/distance_types.hpp>
+#include <raft/neighbors/detail/knn_brute_force.cuh>
+#include <raft/spatial/knn/detail/fused_l2_knn.cuh>
+#include <raft/util/raft_explicit.hpp>
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE
+
+namespace raft::neighbors::brute_force {
+
+/**
+ * @defgroup brute_force_knn Brute-force K-Nearest Neighbors
+ * @{
+ */
+
+/**
+ * @brief Performs a k-select across several (contiguous) row-partitioned index/distance
+ * matrices formatted like the following:
+ *
+ * part1row1: k0, k1, k2, k3
+ * part1row2: k0, k1, k2, k3
+ * part1row3: k0, k1, k2, k3
+ * part2row1: k0, k1, k2, k3
+ * part2row2: k0, k1, k2, k3
+ * part2row3: k0, k1, k2, k3
+ * etc...
+ *
+ * The example above shows what an aggregated index/distance matrix
+ * would look like with two partitions when n_samples=3 and k=4.
+ *
+ * When working with extremely large data sets that have been broken
+ * over multiple indexes, such as when computing over multiple GPUs,
+ * the ids will often start at 0 for each local knn index but the
+ * global ids need to be used when merging them together. An optional
+ * translations vector can be supplied to map the starting id of
+ * each partition to its global id so that the final merged knn
+ * is based on the global ids.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *  #include <raft/core/device_resources.hpp>
+ *  #include <raft/neighbors/brute_force.cuh>
+ *  using namespace raft::neighbors;
+ *
+ *  raft::device_resources handle;
+ *  ...
+ *  compute multiple knn graphs and aggregate row-wise
+ *  (see detailed description above)
+ *  ...
+ *  brute_force::knn_merge_parts(handle, in_keys, in_values, out_keys, out_values, n_samples);
+ * @endcode
+ *
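+ * @tparam value_t element type of the key matrices (in_keys, out_keys)
+ * @tparam idx_t element type of the value matrices; also the index type of the matrix views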
+ *
+ * @param[in] handle
+ * @param[in] in_keys matrix of input keys (size n_samples * n_parts * k)
+ * @param[in] in_values matrix of input values (size n_samples * n_parts * k)
+ * @param[out] out_keys matrix of output keys (size n_samples * k)
+ * @param[out] out_values matrix of output values (size n_samples * k)
+ * @param[in] n_samples number of rows in each partition
+ * @param[in] translations optional vector of starting global id mappings for each local partition
+ */
+template <typename value_t, typename idx_t>
+inline void knn_merge_parts(
+  raft::device_resources const& handle,
+  raft::device_matrix_view<const value_t, idx_t, row_major> in_keys,
+  raft::device_matrix_view<const idx_t, idx_t, row_major> in_values,
+  raft::device_matrix_view<value_t, idx_t, row_major> out_keys,
+  raft::device_matrix_view<idx_t, idx_t, row_major> out_values,
+  size_t n_samples,
+  std::optional<raft::device_vector_view<idx_t, idx_t>> translations = std::nullopt) RAFT_EXPLICIT;
+
+/**
+ * @brief Flat C++ API function to perform a brute force knn on
+ * a series of input arrays and combine the results into a single
+ * output array for indexes and distances. Inputs can be either
+ * row- or column-major but the output matrices will always be in
+ * row-major format.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *  #include <raft/core/device_resources.hpp>
+ *  #include <raft/neighbors/brute_force.cuh>
+ *  #include <raft/distance/distance_types.hpp>
+ *  using namespace raft::neighbors;
+ *
+ *  raft::device_resources handle;
+ *  ...
+ *  auto metric = raft::distance::DistanceType::L2SqrtExpanded;
+ *  brute_force::knn(handle, index, search, indices, distances, metric);
+ * @endcode
+ *
+ * @param[in] handle: the raft handle to use
+ * @param[in] index: vector of device matrices (each size m_i*d) to be used as the knn index
+ * @param[in] search: matrix (size n*d) to be used for searching the index
+ * @param[out] indices: matrix (size n*k) to store output knn indices
+ * @param[out] distances: matrix (size n*k) to store the output knn distance
+ * @param[in] metric: distance metric to use. Euclidean (L2) is used by default
+ * @param[in] metric_arg: the value of `p` for Minkowski (l-p) distances. This
+ *                         is ignored if the metric is not Minkowski.
+ * @param[in] global_id_offset: optional starting global id mapping for the local partition
+ *                              (assumes the index contains contiguous ids in the global id space)
+ * @param[in] distance_epilogue: optional epilogue function to run after computing distances. This
+ *                               function takes a triple of the (value, rowid, colid) for each
+ *                               element in the pairwise distances and returns a transformed value
+ *                               back.
+ */
+template <typename idx_t,
+          typename value_t,
+          typename matrix_idx,
+          typename index_layout,
+          typename search_layout,
+          typename epilogue_op = raft::identity_op>
+void knn(raft::device_resources const& handle,
+         std::vector<raft::device_matrix_view<const value_t, matrix_idx, index_layout>> index,
+         raft::device_matrix_view<const value_t, matrix_idx, search_layout> search,
+         raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,
+         raft::device_matrix_view<value_t, matrix_idx, row_major> distances,
+         distance::DistanceType metric         = distance::DistanceType::L2Unexpanded,
+         std::optional<float> metric_arg       = std::make_optional<float>(2.0f),
+         std::optional<idx_t> global_id_offset = std::nullopt,
+         epilogue_op distance_epilogue         = raft::identity_op()) RAFT_EXPLICIT;
+
+/**
+ * @brief Compute the k-nearest neighbors using L2 expanded/unexpanded distance.
+ *
+ * This is a specialized function for fusing the k-selection with the distance
+ * computation when k < 64. The value of k will be inferred from the number
+ * of columns in the output matrices.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *  #include <raft/core/device_resources.hpp>
+ *  #include <raft/neighbors/brute_force.cuh>
+ *  #include <raft/distance/distance_types.hpp>
+ *  using namespace raft::neighbors;
+ *
+ *  raft::device_resources handle;
+ *  ...
+ *  auto metric = raft::distance::DistanceType::L2SqrtExpanded;
+ *  brute_force::fused_l2_knn(handle, index, search, indices, distances, metric);
+ * @endcode
+ *
+ * @tparam value_t type of values
+ * @tparam idx_t type of indices
+ * @tparam idx_layout layout type of index matrix
+ * @tparam query_layout layout type of query matrix
+ * @param[in] handle raft handle for sharing expensive resources
+ * @param[in] index input index array on device (size m * d)
+ * @param[in] query input query array on device (size n * d)
+ * @param[out] out_inds output indices array on device (size n * k)
+ * @param[out] out_dists output dists array on device (size n * k)
+ * @param[in] metric type of distance computation to perform (must be a variant of L2)
+ */
+template <typename value_t, typename idx_t, typename idx_layout, typename query_layout>
+void fused_l2_knn(raft::device_resources const& handle,
+                  raft::device_matrix_view<const value_t, idx_t, idx_layout> index,
+                  raft::device_matrix_view<const value_t, idx_t, query_layout> query,
+                  raft::device_matrix_view<idx_t, idx_t, row_major> out_inds,
+                  raft::device_matrix_view<value_t, idx_t, row_major> out_dists,
+                  raft::distance::DistanceType metric) RAFT_EXPLICIT;
+
+/** @} */  // end group brute_force_knn
+
+}  // namespace raft::neighbors::brute_force
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE
+
+// No extern template for raft::neighbors::brute_force::knn_merge_parts
+
+#define instantiate_raft_neighbors_brute_force_knn(                                         \
+  idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op)                     \
+  extern template void raft::neighbors::brute_force::                                       \
+    knn<idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op>(              \
+      raft::device_resources const& handle,                                                 \
+      std::vector<raft::device_matrix_view<const value_t, matrix_idx, index_layout>> index, \
+      raft::device_matrix_view<const value_t, matrix_idx, search_layout> search,            \
+      raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,                       \
+      raft::device_matrix_view<value_t, matrix_idx, row_major> distances,                   \
+      raft::distance::DistanceType metric,                                                  \
+      std::optional<float> metric_arg,                                                      \
+      std::optional<idx_t> global_id_offset,                                                \
+      epilogue_op distance_epilogue);
+
+instantiate_raft_neighbors_brute_force_knn(
+  int64_t, float, uint32_t, raft::row_major, raft::row_major, raft::identity_op);
+instantiate_raft_neighbors_brute_force_knn(
+  int64_t, float, int64_t, raft::row_major, raft::row_major, raft::identity_op);
+instantiate_raft_neighbors_brute_force_knn(
+  int, float, int, raft::row_major, raft::row_major, raft::identity_op);
+instantiate_raft_neighbors_brute_force_knn(
+  uint32_t, float, uint32_t, raft::row_major, raft::row_major, raft::identity_op);
+
+#undef instantiate_raft_neighbors_brute_force_knn
+
+#define instantiate_raft_neighbors_brute_force_fused_l2_knn(            \
+  value_t, idx_t, idx_layout, query_layout)                             \
+  extern template void raft::neighbors::brute_force::fused_l2_knn(      \
+    raft::device_resources const& handle,                               \
+    raft::device_matrix_view<const value_t, idx_t, idx_layout> index,   \
+    raft::device_matrix_view<const value_t, idx_t, query_layout> query, \
+    raft::device_matrix_view<idx_t, idx_t, row_major> out_inds,         \
+    raft::device_matrix_view<value_t, idx_t, row_major> out_dists,      \
+    raft::distance::DistanceType metric);
+
+instantiate_raft_neighbors_brute_force_fused_l2_knn(float,
+                                                    int64_t,
+                                                    raft::row_major,
+                                                    raft::row_major)
+
+#undef instantiate_raft_neighbors_brute_force_fused_l2_knn
diff --git a/cpp/include/raft/neighbors/brute_force.cuh b/cpp/include/raft/neighbors/brute_force.cuh
index e69de29bb2..f767c840bf 100644
--- a/cpp/include/raft/neighbors/brute_force.cuh
+++ b/cpp/include/raft/neighbors/brute_force.cuh
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#include "brute_force-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "brute_force-ext.cuh"
+#endif
diff --git a/cpp/include/raft/neighbors/specializations.cuh b/cpp/include/raft/neighbors/specializations.cuh
index 1fe646bfa5..aaca4412e9 100644
--- a/cpp/include/raft/neighbors/specializations.cuh
+++ b/cpp/include/raft/neighbors/specializations.cuh
@@ -16,8 +16,4 @@
 
 #pragma once
 
-#include <raft/neighbors/specializations/brute_force.cuh>
-
-#include <raft/cluster/specializations.cuh>
 #include <raft/distance/specializations.cuh>
-#include <raft/matrix/specializations.cuh>
diff --git a/cpp/include/raft/neighbors/specializations/brute_force.cuh b/cpp/include/raft/neighbors/specializations/brute_force.cuh
deleted file mode 100644
index 1337beb68a..0000000000
--- a/cpp/include/raft/neighbors/specializations/brute_force.cuh
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/neighbors/brute_force.cuh>
-
-// also define the detail api, which is used by raft::neighbors::brute_force
-// (not doing the public api, since has extra template params on index_layout, matrix_index,
-// search_layout etc - and isn't clear what the defaults here should be)
-namespace raft::neighbors::detail {
-#define RAFT_INST(IdxT, T, IntT)                                                                 \
-  extern template void brute_force_knn_impl<IntT, IdxT, T>(raft::device_resources const& handle, \
-                                                           std::vector<T*>& input,               \
-                                                           std::vector<IntT>& sizes,             \
-                                                           IntT D,                               \
-                                                           T* search_items,                      \
-                                                           IntT n,                               \
-                                                           IdxT* res_I,                          \
-                                                           T* res_D,                             \
-                                                           IntT k,                               \
-                                                           bool rowMajorIndex,                   \
-                                                           bool rowMajorQuery,                   \
-                                                           std::vector<IdxT>* translations,      \
-                                                           raft::distance::DistanceType metric,  \
-                                                           float metricArg,                      \
-                                                           raft::identity_op);
-RAFT_INST(long, float, int);
-RAFT_INST(long, float, unsigned int);
-RAFT_INST(uint32_t, float, int);
-RAFT_INST(uint32_t, float, unsigned int);
-#undef RAFT_INST
-}  // namespace raft::neighbors::detail
diff --git a/cpp/src/neighbors/brute_force_00_generate.py b/cpp/src/neighbors/brute_force_00_generate.py
new file mode 100644
index 0000000000..d955b3ea78
--- /dev/null
+++ b/cpp/src/neighbors/brute_force_00_generate.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+
+header = """
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/neighbors/brute_force-inl.cuh>
+
+#define instantiate_raft_neighbors_brute_force_knn(idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op) \\
+    template void raft::neighbors::brute_force::knn<idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op>( \\
+        raft::device_resources const& handle,                           \\
+        std::vector<raft::device_matrix_view<const value_t, matrix_idx, index_layout>> index, \\
+        raft::device_matrix_view<const value_t, matrix_idx, search_layout> search, \\
+        raft::device_matrix_view<idx_t, matrix_idx, row_major> indices, \\
+        raft::device_matrix_view<value_t, matrix_idx, row_major> distances, \\
+        raft::distance::DistanceType metric,                            \\
+        std::optional<float> metric_arg,                                \\
+        std::optional<idx_t> global_id_offset,                          \\
+        epilogue_op distance_epilogue);
+
+
+#define instantiate_raft_neighbors_brute_force_fused_l2_knn(value_t, idx_t, idx_layout, query_layout) \\
+    template void raft::neighbors::brute_force::fused_l2_knn(    \\
+        raft::device_resources const& handle,                           \\
+        raft::device_matrix_view<const value_t, idx_t, idx_layout> index, \\
+        raft::device_matrix_view<const value_t, idx_t, query_layout> query, \\
+        raft::device_matrix_view<idx_t, idx_t, row_major> out_inds,     \\
+        raft::device_matrix_view<value_t, idx_t, row_major> out_dists,  \\
+        raft::distance::DistanceType metric);
+
+"""
+
+trailer = """
+
+#undef instantiate_raft_neighbors_brute_force_knn
+#undef instantiate_raft_neighbors_brute_force_fused_l2_knn
+"""
+
+knn_types = dict(
+    int64_t_float_uint32_t=("int64_t","float","uint32_t"),
+    int64_t_float_int64_t=("int64_t","float","int64_t"),
+    int_float_int=("int","float","int"),
+    uint32_t_float_uint32_t=("uint32_t","float","uint32_t"),
+)
+
+fused_l2_knn_types = dict(
+    float_int64_t=("float", "int64_t"),
+)
+
+# knn
+for type_path, (idx_t, value_t, matrix_idx) in knn_types.items():
+    path = f"brute_force_knn_{type_path}.cu"
+    with open(path, "w") as f:
+        f.write(header)
+        f.write(f"instantiate_raft_neighbors_brute_force_knn({idx_t},{value_t},{matrix_idx},raft::row_major,raft::row_major,raft::identity_op);\n")
+        f.write(trailer)
+    # For pasting into CMakeLists.txt
+    print(f"src/neighbors/{path}")
+
+# fused_l2_knn
+for type_path, (value_t, idx_t) in fused_l2_knn_types.items():
+    path = f"brute_force_fused_l2_knn_{type_path}.cu"
+    with open(path, "w") as f:
+        f.write(header)
+        f.write(f"instantiate_raft_neighbors_brute_force_fused_l2_knn({value_t},{idx_t},raft::row_major,raft::row_major);\n")
+        f.write(trailer)
+    # For pasting into CMakeLists.txt
+    print(f"src/neighbors/{path}")
diff --git a/cpp/src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu b/cpp/src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu
new file mode 100644
index 0000000000..8e9ba3ea22
--- /dev/null
+++ b/cpp/src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu
@@ -0,0 +1,51 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/neighbors/brute_force-inl.cuh>
+
+#define instantiate_raft_neighbors_brute_force_knn(                                         \
+  idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op)                     \
+  template void raft::neighbors::brute_force::                                              \
+    knn<idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op>(              \
+      raft::device_resources const& handle,                                                 \
+      std::vector<raft::device_matrix_view<const value_t, matrix_idx, index_layout>> index, \
+      raft::device_matrix_view<const value_t, matrix_idx, search_layout> search,            \
+      raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,                       \
+      raft::device_matrix_view<value_t, matrix_idx, row_major> distances,                   \
+      raft::distance::DistanceType metric,                                                  \
+      std::optional<float> metric_arg,                                                      \
+      std::optional<idx_t> global_id_offset,                                                \
+      epilogue_op distance_epilogue);
+
+#define instantiate_raft_neighbors_brute_force_fused_l2_knn(            \
+  value_t, idx_t, idx_layout, query_layout)                             \
+  template void raft::neighbors::brute_force::fused_l2_knn(             \
+    raft::device_resources const& handle,                               \
+    raft::device_matrix_view<const value_t, idx_t, idx_layout> index,   \
+    raft::device_matrix_view<const value_t, idx_t, query_layout> query, \
+    raft::device_matrix_view<idx_t, idx_t, row_major> out_inds,         \
+    raft::device_matrix_view<value_t, idx_t, row_major> out_dists,      \
+    raft::distance::DistanceType metric);
+
+instantiate_raft_neighbors_brute_force_fused_l2_knn(float,
+                                                    int64_t,
+                                                    raft::row_major,
+                                                    raft::row_major);
+
+#undef instantiate_raft_neighbors_brute_force_knn
+#undef instantiate_raft_neighbors_brute_force_fused_l2_knn
diff --git a/cpp/src/neighbors/brute_force_knn_int64_t_float_int64_t.cu b/cpp/src/neighbors/brute_force_knn_int64_t_float_int64_t.cu
new file mode 100644
index 0000000000..b2ed7c7732
--- /dev/null
+++ b/cpp/src/neighbors/brute_force_knn_int64_t_float_int64_t.cu
@@ -0,0 +1,49 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/neighbors/brute_force-inl.cuh>
+
+#define instantiate_raft_neighbors_brute_force_knn(                                         \
+  idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op)                     \
+  template void raft::neighbors::brute_force::                                              \
+    knn<idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op>(              \
+      raft::device_resources const& handle,                                                 \
+      std::vector<raft::device_matrix_view<const value_t, matrix_idx, index_layout>> index, \
+      raft::device_matrix_view<const value_t, matrix_idx, search_layout> search,            \
+      raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,                       \
+      raft::device_matrix_view<value_t, matrix_idx, row_major> distances,                   \
+      raft::distance::DistanceType metric,                                                  \
+      std::optional<float> metric_arg,                                                      \
+      std::optional<idx_t> global_id_offset,                                                \
+      epilogue_op distance_epilogue);
+
+#define instantiate_raft_neighbors_brute_force_fused_l2_knn(            \
+  value_t, idx_t, idx_layout, query_layout)                             \
+  template void raft::neighbors::brute_force::fused_l2_knn(             \
+    raft::device_resources const& handle,                               \
+    raft::device_matrix_view<const value_t, idx_t, idx_layout> index,   \
+    raft::device_matrix_view<const value_t, idx_t, query_layout> query, \
+    raft::device_matrix_view<idx_t, idx_t, row_major> out_inds,         \
+    raft::device_matrix_view<value_t, idx_t, row_major> out_dists,      \
+    raft::distance::DistanceType metric);
+
+instantiate_raft_neighbors_brute_force_knn(
+  int64_t, float, int64_t, raft::row_major, raft::row_major, raft::identity_op);
+
+#undef instantiate_raft_neighbors_brute_force_knn
+#undef instantiate_raft_neighbors_brute_force_fused_l2_knn
diff --git a/cpp/src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu b/cpp/src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
new file mode 100644
index 0000000000..fa815cee64
--- /dev/null
+++ b/cpp/src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
@@ -0,0 +1,49 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/neighbors/brute_force-inl.cuh>
+
+#define instantiate_raft_neighbors_brute_force_knn(                                         \
+  idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op)                     \
+  template void raft::neighbors::brute_force::                                              \
+    knn<idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op>(              \
+      raft::device_resources const& handle,                                                 \
+      std::vector<raft::device_matrix_view<const value_t, matrix_idx, index_layout>> index, \
+      raft::device_matrix_view<const value_t, matrix_idx, search_layout> search,            \
+      raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,                       \
+      raft::device_matrix_view<value_t, matrix_idx, row_major> distances,                   \
+      raft::distance::DistanceType metric,                                                  \
+      std::optional<float> metric_arg,                                                      \
+      std::optional<idx_t> global_id_offset,                                                \
+      epilogue_op distance_epilogue);
+
+#define instantiate_raft_neighbors_brute_force_fused_l2_knn(            \
+  value_t, idx_t, idx_layout, query_layout)                             \
+  template void raft::neighbors::brute_force::fused_l2_knn(             \
+    raft::device_resources const& handle,                               \
+    raft::device_matrix_view<const value_t, idx_t, idx_layout> index,   \
+    raft::device_matrix_view<const value_t, idx_t, query_layout> query, \
+    raft::device_matrix_view<idx_t, idx_t, row_major> out_inds,         \
+    raft::device_matrix_view<value_t, idx_t, row_major> out_dists,      \
+    raft::distance::DistanceType metric);
+
+instantiate_raft_neighbors_brute_force_knn(
+  int64_t, float, uint32_t, raft::row_major, raft::row_major, raft::identity_op);
+
+#undef instantiate_raft_neighbors_brute_force_knn
+#undef instantiate_raft_neighbors_brute_force_fused_l2_knn
diff --git a/cpp/src/neighbors/brute_force_knn_int_float_int.cu b/cpp/src/neighbors/brute_force_knn_int_float_int.cu
new file mode 100644
index 0000000000..c3f51a2b1b
--- /dev/null
+++ b/cpp/src/neighbors/brute_force_knn_int_float_int.cu
@@ -0,0 +1,49 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/neighbors/brute_force-inl.cuh>
+
+#define instantiate_raft_neighbors_brute_force_knn(                                         \
+  idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op)                     \
+  template void raft::neighbors::brute_force::                                              \
+    knn<idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op>(              \
+      raft::device_resources const& handle,                                                 \
+      std::vector<raft::device_matrix_view<const value_t, matrix_idx, index_layout>> index, \
+      raft::device_matrix_view<const value_t, matrix_idx, search_layout> search,            \
+      raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,                       \
+      raft::device_matrix_view<value_t, matrix_idx, row_major> distances,                   \
+      raft::distance::DistanceType metric,                                                  \
+      std::optional<float> metric_arg,                                                      \
+      std::optional<idx_t> global_id_offset,                                                \
+      epilogue_op distance_epilogue);
+
+#define instantiate_raft_neighbors_brute_force_fused_l2_knn(            \
+  value_t, idx_t, idx_layout, query_layout)                             \
+  template void raft::neighbors::brute_force::fused_l2_knn(             \
+    raft::device_resources const& handle,                               \
+    raft::device_matrix_view<const value_t, idx_t, idx_layout> index,   \
+    raft::device_matrix_view<const value_t, idx_t, query_layout> query, \
+    raft::device_matrix_view<idx_t, idx_t, row_major> out_inds,         \
+    raft::device_matrix_view<value_t, idx_t, row_major> out_dists,      \
+    raft::distance::DistanceType metric);
+
+instantiate_raft_neighbors_brute_force_knn(
+  int, float, int, raft::row_major, raft::row_major, raft::identity_op);
+
+#undef instantiate_raft_neighbors_brute_force_knn
+#undef instantiate_raft_neighbors_brute_force_fused_l2_knn
diff --git a/cpp/src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu b/cpp/src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
new file mode 100644
index 0000000000..88e1e82aab
--- /dev/null
+++ b/cpp/src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
@@ -0,0 +1,49 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <raft/neighbors/brute_force-inl.cuh>
+
+#define instantiate_raft_neighbors_brute_force_knn(                                         \
+  idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op)                     \
+  template void raft::neighbors::brute_force::                                              \
+    knn<idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op>(              \
+      raft::device_resources const& handle,                                                 \
+      std::vector<raft::device_matrix_view<const value_t, matrix_idx, index_layout>> index, \
+      raft::device_matrix_view<const value_t, matrix_idx, search_layout> search,            \
+      raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,                       \
+      raft::device_matrix_view<value_t, matrix_idx, row_major> distances,                   \
+      raft::distance::DistanceType metric,                                                  \
+      std::optional<float> metric_arg,                                                      \
+      std::optional<idx_t> global_id_offset,                                                \
+      epilogue_op distance_epilogue);
+
+#define instantiate_raft_neighbors_brute_force_fused_l2_knn(            \
+  value_t, idx_t, idx_layout, query_layout)                             \
+  template void raft::neighbors::brute_force::fused_l2_knn(             \
+    raft::device_resources const& handle,                               \
+    raft::device_matrix_view<const value_t, idx_t, idx_layout> index,   \
+    raft::device_matrix_view<const value_t, idx_t, query_layout> query, \
+    raft::device_matrix_view<idx_t, idx_t, row_major> out_inds,         \
+    raft::device_matrix_view<value_t, idx_t, row_major> out_dists,      \
+    raft::distance::DistanceType metric);
+
+instantiate_raft_neighbors_brute_force_knn(
+  uint32_t, float, uint32_t, raft::row_major, raft::row_major, raft::identity_op);
+
+#undef instantiate_raft_neighbors_brute_force_knn
+#undef instantiate_raft_neighbors_brute_force_fused_l2_knn
diff --git a/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_int.cu b/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_int.cu
deleted file mode 100644
index 04aa42c9f1..0000000000
--- a/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_int.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/brute_force.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::detail {
-#define RAFT_INST(IdxT, T, IntT)                                                          \
-  template void brute_force_knn_impl<IntT, IdxT, T>(raft::device_resources const& handle, \
-                                                    std::vector<T*>& input,               \
-                                                    std::vector<IntT>& sizes,             \
-                                                    IntT D,                               \
-                                                    T* search_items,                      \
-                                                    IntT n,                               \
-                                                    IdxT* res_I,                          \
-                                                    T* res_D,                             \
-                                                    IntT k,                               \
-                                                    bool rowMajorIndex,                   \
-                                                    bool rowMajorQuery,                   \
-                                                    std::vector<IdxT>* translations,      \
-                                                    raft::distance::DistanceType metric,  \
-                                                    float metricArg,                      \
-                                                    raft::identity_op);
-RAFT_INST(long, float, int);
-#undef RAFT_INST
-}  // namespace raft::neighbors::detail
diff --git a/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_uint.cu b/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_uint.cu
deleted file mode 100644
index a8b9d4299a..0000000000
--- a/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_long_float_uint.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/brute_force.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::detail {
-#define RAFT_INST(IdxT, T, IntT)                                                          \
-  template void brute_force_knn_impl<IntT, IdxT, T>(raft::device_resources const& handle, \
-                                                    std::vector<T*>& input,               \
-                                                    std::vector<IntT>& sizes,             \
-                                                    IntT D,                               \
-                                                    T* search_items,                      \
-                                                    IntT n,                               \
-                                                    IdxT* res_I,                          \
-                                                    T* res_D,                             \
-                                                    IntT k,                               \
-                                                    bool rowMajorIndex,                   \
-                                                    bool rowMajorQuery,                   \
-                                                    std::vector<IdxT>* translations,      \
-                                                    raft::distance::DistanceType metric,  \
-                                                    float metricArg,                      \
-                                                    raft::identity_op);
-RAFT_INST(long, float, unsigned int);
-#undef RAFT_INST
-}  // namespace raft::neighbors::detail
diff --git a/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_int.cu b/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_int.cu
deleted file mode 100644
index c97e6e936a..0000000000
--- a/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_int.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/brute_force.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::detail {
-#define RAFT_INST(IdxT, T, IntT)                                                          \
-  template void brute_force_knn_impl<IntT, IdxT, T>(raft::device_resources const& handle, \
-                                                    std::vector<T*>& input,               \
-                                                    std::vector<IntT>& sizes,             \
-                                                    IntT D,                               \
-                                                    T* search_items,                      \
-                                                    IntT n,                               \
-                                                    IdxT* res_I,                          \
-                                                    T* res_D,                             \
-                                                    IntT k,                               \
-                                                    bool rowMajorIndex,                   \
-                                                    bool rowMajorQuery,                   \
-                                                    std::vector<IdxT>* translations,      \
-                                                    raft::distance::DistanceType metric,  \
-                                                    float metricArg,                      \
-                                                    raft::identity_op);
-RAFT_INST(uint32_t, float, int);
-#undef RAFT_INST
-}  // namespace raft::neighbors::detail
diff --git a/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_uint.cu b/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_uint.cu
deleted file mode 100644
index 87451c385a..0000000000
--- a/cpp/src/neighbors/specializations/detail/brute_force_knn_impl_uint_float_uint.cu
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/neighbors/brute_force.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-namespace raft::neighbors::detail {
-#define RAFT_INST(IdxT, T, IntT)                                                          \
-  template void brute_force_knn_impl<IntT, IdxT, T>(raft::device_resources const& handle, \
-                                                    std::vector<T*>& input,               \
-                                                    std::vector<IntT>& sizes,             \
-                                                    IntT D,                               \
-                                                    T* search_items,                      \
-                                                    IntT n,                               \
-                                                    IdxT* res_I,                          \
-                                                    T* res_D,                             \
-                                                    IntT k,                               \
-                                                    bool rowMajorIndex,                   \
-                                                    bool rowMajorQuery,                   \
-                                                    std::vector<IdxT>* translations,      \
-                                                    raft::distance::DistanceType metric,  \
-                                                    float metricArg,                      \
-                                                    raft::identity_op);
-RAFT_INST(uint32_t, float, unsigned int);
-#undef RAFT_INST
-}  // namespace raft::neighbors::detail

From aae0dfa7d8e9ef8d6241b9040d560bbd3e6595e9 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 13 Apr 2023 19:35:05 +0200
Subject: [PATCH 36/89] Remove deprecated specialization files

---
 cpp/bench/prims/cluster/kmeans.cu             |  4 --
 cpp/bench/prims/cluster/kmeans_balanced.cu    |  4 --
 cpp/bench/prims/matrix/select_k.cu            |  4 --
 cpp/include/raft/cluster/specializations.cuh  | 24 ----------
 cpp/include/raft/matrix/specializations.cuh   | 19 --------
 .../specializations/detail/select_k.cuh       | 47 -------------------
 .../raft_internal/matrix/select_k.cuh         |  7 +--
 .../raft_internal/neighbors/naive_knn.cuh     |  4 --
 .../detail/select_k_float_int64_t.cu          | 36 --------------
 .../detail/select_k_float_uint32_t.cu         | 36 --------------
 .../detail/select_k_half_int64_t.cu           | 36 --------------
 .../detail/select_k_half_uint32_t.cu          | 36 --------------
 .../cluster/cluster_cost_double.cu            |  1 -
 .../cluster/cluster_cost_float.cu             |  1 -
 .../raft_runtime/cluster/kmeans_fit_double.cu |  1 -
 .../raft_runtime/cluster/kmeans_fit_float.cu  |  1 -
 .../cluster/kmeans_init_plus_plus_double.cu   |  1 -
 .../cluster/kmeans_init_plus_plus_float.cu    |  1 -
 .../raft_runtime/cluster/update_centroids.cuh |  3 +-
 .../cluster/update_centroids_double.cu        |  3 +-
 .../cluster/update_centroids_float.cu         |  3 +-
 cpp/test/cluster/kmeans.cu                    |  4 --
 cpp/test/cluster/kmeans_balanced.cu           |  4 --
 cpp/test/cluster/kmeans_find_k.cu             |  4 --
 cpp/test/matrix/select_k.cu                   |  4 --
 25 files changed, 4 insertions(+), 284 deletions(-)
 delete mode 100644 cpp/include/raft/cluster/specializations.cuh
 delete mode 100644 cpp/include/raft/matrix/specializations.cuh
 delete mode 100644 cpp/include/raft/matrix/specializations/detail/select_k.cuh
 delete mode 100644 cpp/src/matrix/specializations/detail/select_k_float_int64_t.cu
 delete mode 100644 cpp/src/matrix/specializations/detail/select_k_float_uint32_t.cu
 delete mode 100644 cpp/src/matrix/specializations/detail/select_k_half_int64_t.cu
 delete mode 100644 cpp/src/matrix/specializations/detail/select_k_half_uint32_t.cu

diff --git a/cpp/bench/prims/cluster/kmeans.cu b/cpp/bench/prims/cluster/kmeans.cu
index af7afb8037..3147960f72 100644
--- a/cpp/bench/prims/cluster/kmeans.cu
+++ b/cpp/bench/prims/cluster/kmeans.cu
@@ -18,10 +18,6 @@
 #include <raft/cluster/kmeans.cuh>
 #include <raft/cluster/kmeans_types.hpp>
 
-#if defined RAFT_COMPILED
-#include <raft/cluster/specializations.cuh>
-#endif
-
 namespace raft::bench::cluster {
 
 struct KMeansBenchParams {
diff --git a/cpp/bench/prims/cluster/kmeans_balanced.cu b/cpp/bench/prims/cluster/kmeans_balanced.cu
index 6bda43bdb2..42a8f7967c 100644
--- a/cpp/bench/prims/cluster/kmeans_balanced.cu
+++ b/cpp/bench/prims/cluster/kmeans_balanced.cu
@@ -18,10 +18,6 @@
 #include <raft/cluster/kmeans_balanced.cuh>
 #include <raft/random/rng.cuh>
 
-#if defined RAFT_COMPILED
-#include <raft/cluster/specializations.cuh>
-#endif
-
 namespace raft::bench::cluster {
 
 struct KMeansBalancedBenchParams {
diff --git a/cpp/bench/prims/matrix/select_k.cu b/cpp/bench/prims/matrix/select_k.cu
index 870119db52..eb2b09cc4a 100644
--- a/cpp/bench/prims/matrix/select_k.cu
+++ b/cpp/bench/prims/matrix/select_k.cu
@@ -23,10 +23,6 @@
 #include <raft/sparse/detail/utils.h>
 #include <raft/util/cudart_utils.hpp>
 
-#if defined RAFT_COMPILED
-#include <raft/matrix/specializations.cuh>
-#endif
-
 #include <raft/matrix/detail/select_radix.cuh>
 #include <raft/matrix/detail/select_warpsort.cuh>
 #include <raft/matrix/select_k.cuh>
diff --git a/cpp/include/raft/cluster/specializations.cuh b/cpp/include/raft/cluster/specializations.cuh
deleted file mode 100644
index 9b68d7adc9..0000000000
--- a/cpp/include/raft/cluster/specializations.cuh
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __CLUSTER_SPECIALIZATIONS_H
-#define __CLUSTER_SPECIALIZATIONS_H
-
-#pragma once
-
-#include <raft/distance/specializations.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#endif
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/specializations.cuh b/cpp/include/raft/matrix/specializations.cuh
deleted file mode 100644
index 07bdeab507..0000000000
--- a/cpp/include/raft/matrix/specializations.cuh
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/matrix/specializations/detail/select_k.cuh>
diff --git a/cpp/include/raft/matrix/specializations/detail/select_k.cuh b/cpp/include/raft/matrix/specializations/detail/select_k.cuh
deleted file mode 100644
index 3cb1a2d8dc..0000000000
--- a/cpp/include/raft/matrix/specializations/detail/select_k.cuh
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/matrix/detail/select_k.cuh>
-
-#include <cuda_fp16.h>
-
-namespace raft::matrix::detail {
-
-#define RAFT_INST(T, IdxT)                                      \
-  extern template void select_k<T, IdxT>(const T*,              \
-                                         const IdxT*,           \
-                                         size_t,                \
-                                         size_t,                \
-                                         int,                   \
-                                         T*,                    \
-                                         IdxT*,                 \
-                                         bool,                  \
-                                         rmm::cuda_stream_view, \
-                                         rmm::mr::device_memory_resource*);
-
-// Commonly used types
-RAFT_INST(float, int64_t);
-RAFT_INST(half, int64_t);
-
-// These instances are used in the ivf_pq::search parameterized by the internal_distance_dtype
-RAFT_INST(float, uint32_t);
-RAFT_INST(half, uint32_t);
-
-#undef RAFT_INST
-
-}  // namespace raft::matrix::detail
diff --git a/cpp/internal/raft_internal/matrix/select_k.cuh b/cpp/internal/raft_internal/matrix/select_k.cuh
index 188122c9b4..8dedec67cb 100644
--- a/cpp/internal/raft_internal/matrix/select_k.cuh
+++ b/cpp/internal/raft_internal/matrix/select_k.cuh
@@ -16,16 +16,11 @@
 
 #pragma once
 
+#include <raft/core/device_resources.hpp>
 #include <raft/matrix/detail/select_radix.cuh>
 #include <raft/matrix/detail/select_warpsort.cuh>
 #include <raft/matrix/select_k.cuh>
 
-#ifdef RAFT_COMPILED
-#include <raft/matrix/specializations.cuh>
-#endif
-
-#include <raft/core/device_resources.hpp>
-
 namespace raft::matrix::select {
 
 struct params {
diff --git a/cpp/internal/raft_internal/neighbors/naive_knn.cuh b/cpp/internal/raft_internal/neighbors/naive_knn.cuh
index 47d6f068e3..3ad055272b 100644
--- a/cpp/internal/raft_internal/neighbors/naive_knn.cuh
+++ b/cpp/internal/raft_internal/neighbors/naive_knn.cuh
@@ -21,10 +21,6 @@
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 #include <raft/util/cuda_utils.cuh>
 
-#if defined RAFT_COMPILED
-#include <raft/matrix/specializations/detail/select_k.cuh>
-#endif
-
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
diff --git a/cpp/src/matrix/specializations/detail/select_k_float_int64_t.cu b/cpp/src/matrix/specializations/detail/select_k_float_int64_t.cu
deleted file mode 100644
index 370ab1ba50..0000000000
--- a/cpp/src/matrix/specializations/detail/select_k_float_int64_t.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/matrix/detail/select_k.cuh>
-#include <raft/matrix/specializations.cuh>
-
-namespace raft::matrix::detail {
-
-#define RAFT_INST(T, IdxT)                               \
-  template void select_k<T, IdxT>(const T*,              \
-                                  const IdxT*,           \
-                                  size_t,                \
-                                  size_t,                \
-                                  int,                   \
-                                  T*,                    \
-                                  IdxT*,                 \
-                                  bool,                  \
-                                  rmm::cuda_stream_view, \
-                                  rmm::mr::device_memory_resource*);
-
-RAFT_INST(float, int64_t);
-
-}  // namespace raft::matrix::detail
diff --git a/cpp/src/matrix/specializations/detail/select_k_float_uint32_t.cu b/cpp/src/matrix/specializations/detail/select_k_float_uint32_t.cu
deleted file mode 100644
index c6733c2a46..0000000000
--- a/cpp/src/matrix/specializations/detail/select_k_float_uint32_t.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/matrix/detail/select_k.cuh>
-#include <raft/matrix/specializations.cuh>
-
-namespace raft::matrix::detail {
-
-#define RAFT_INST(T, IdxT)                               \
-  template void select_k<T, IdxT>(const T*,              \
-                                  const IdxT*,           \
-                                  size_t,                \
-                                  size_t,                \
-                                  int,                   \
-                                  T*,                    \
-                                  IdxT*,                 \
-                                  bool,                  \
-                                  rmm::cuda_stream_view, \
-                                  rmm::mr::device_memory_resource*);
-
-RAFT_INST(float, uint32_t);
-
-}  // namespace raft::matrix::detail
diff --git a/cpp/src/matrix/specializations/detail/select_k_half_int64_t.cu b/cpp/src/matrix/specializations/detail/select_k_half_int64_t.cu
deleted file mode 100644
index 38e28ac54d..0000000000
--- a/cpp/src/matrix/specializations/detail/select_k_half_int64_t.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/matrix/detail/select_k.cuh>
-#include <raft/matrix/specializations.cuh>
-
-namespace raft::matrix::detail {
-
-#define RAFT_INST(T, IdxT)                               \
-  template void select_k<T, IdxT>(const T*,              \
-                                  const IdxT*,           \
-                                  size_t,                \
-                                  size_t,                \
-                                  int,                   \
-                                  T*,                    \
-                                  IdxT*,                 \
-                                  bool,                  \
-                                  rmm::cuda_stream_view, \
-                                  rmm::mr::device_memory_resource*);
-
-RAFT_INST(half, int64_t);
-
-}  // namespace raft::matrix::detail
diff --git a/cpp/src/matrix/specializations/detail/select_k_half_uint32_t.cu b/cpp/src/matrix/specializations/detail/select_k_half_uint32_t.cu
deleted file mode 100644
index 108bd30b49..0000000000
--- a/cpp/src/matrix/specializations/detail/select_k_half_uint32_t.cu
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/matrix/detail/select_k.cuh>
-#include <raft/matrix/specializations.cuh>
-
-namespace raft::matrix::detail {
-
-#define RAFT_INST(T, IdxT)                               \
-  template void select_k<T, IdxT>(const T*,              \
-                                  const IdxT*,           \
-                                  size_t,                \
-                                  size_t,                \
-                                  int,                   \
-                                  T*,                    \
-                                  IdxT*,                 \
-                                  bool,                  \
-                                  rmm::cuda_stream_view, \
-                                  rmm::mr::device_memory_resource*);
-
-RAFT_INST(half, uint32_t);
-
-}  // namespace raft::matrix::detail
diff --git a/cpp/src/raft_runtime/cluster/cluster_cost_double.cu b/cpp/src/raft_runtime/cluster/cluster_cost_double.cu
index 2244ba4ed3..b6df92c839 100644
--- a/cpp/src/raft_runtime/cluster/cluster_cost_double.cu
+++ b/cpp/src/raft_runtime/cluster/cluster_cost_double.cu
@@ -15,7 +15,6 @@
  */
 
 #include "cluster_cost.cuh"
-#include <raft/cluster/specializations.cuh>
 #include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 
diff --git a/cpp/src/raft_runtime/cluster/cluster_cost_float.cu b/cpp/src/raft_runtime/cluster/cluster_cost_float.cu
index 4164265b55..2c26b69984 100644
--- a/cpp/src/raft_runtime/cluster/cluster_cost_float.cu
+++ b/cpp/src/raft_runtime/cluster/cluster_cost_float.cu
@@ -15,7 +15,6 @@
  */
 
 #include "cluster_cost.cuh"
-#include <raft/cluster/specializations.cuh>
 #include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 
diff --git a/cpp/src/raft_runtime/cluster/kmeans_fit_double.cu b/cpp/src/raft_runtime/cluster/kmeans_fit_double.cu
index 12f4fba318..0b8b458042 100644
--- a/cpp/src/raft_runtime/cluster/kmeans_fit_double.cu
+++ b/cpp/src/raft_runtime/cluster/kmeans_fit_double.cu
@@ -15,7 +15,6 @@
  */
 
 #include <raft/cluster/kmeans.cuh>
-#include <raft/cluster/specializations.cuh>
 #include <raft/core/device_resources.hpp>
 
 namespace raft::runtime::cluster::kmeans {
diff --git a/cpp/src/raft_runtime/cluster/kmeans_fit_float.cu b/cpp/src/raft_runtime/cluster/kmeans_fit_float.cu
index 48505dcc3e..a2831c2cf0 100644
--- a/cpp/src/raft_runtime/cluster/kmeans_fit_float.cu
+++ b/cpp/src/raft_runtime/cluster/kmeans_fit_float.cu
@@ -15,7 +15,6 @@
  */
 
 #include <raft/cluster/kmeans.cuh>
-#include <raft/cluster/specializations.cuh>
 #include <raft/core/device_resources.hpp>
 
 namespace raft::runtime::cluster::kmeans {
diff --git a/cpp/src/raft_runtime/cluster/kmeans_init_plus_plus_double.cu b/cpp/src/raft_runtime/cluster/kmeans_init_plus_plus_double.cu
index 5bb0835595..d2ec26f882 100644
--- a/cpp/src/raft_runtime/cluster/kmeans_init_plus_plus_double.cu
+++ b/cpp/src/raft_runtime/cluster/kmeans_init_plus_plus_double.cu
@@ -15,7 +15,6 @@
  */
 
 #include <raft/cluster/kmeans.cuh>
-#include <raft/cluster/specializations.cuh>
 #include <raft/core/device_resources.hpp>
 
 namespace raft::runtime::cluster::kmeans {
diff --git a/cpp/src/raft_runtime/cluster/kmeans_init_plus_plus_float.cu b/cpp/src/raft_runtime/cluster/kmeans_init_plus_plus_float.cu
index f211afd06e..bacab3b7d6 100644
--- a/cpp/src/raft_runtime/cluster/kmeans_init_plus_plus_float.cu
+++ b/cpp/src/raft_runtime/cluster/kmeans_init_plus_plus_float.cu
@@ -15,7 +15,6 @@
  */
 
 #include <raft/cluster/kmeans.cuh>
-#include <raft/cluster/specializations.cuh>
 #include <raft/core/device_resources.hpp>
 
 namespace raft::runtime::cluster::kmeans {
diff --git a/cpp/src/raft_runtime/cluster/update_centroids.cuh b/cpp/src/raft_runtime/cluster/update_centroids.cuh
index 7c13252384..29c3bffe75 100644
--- a/cpp/src/raft_runtime/cluster/update_centroids.cuh
+++ b/cpp/src/raft_runtime/cluster/update_centroids.cuh
@@ -15,7 +15,6 @@
  */
 
 #include <raft/cluster/kmeans.cuh>
-#include <raft/cluster/specializations.cuh>
 #include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/norm.cuh>
@@ -68,4 +67,4 @@ void update_centroids(raft::device_resources const& handle,
                                                          weight_per_cluster_view,
                                                          new_centroids_view);
 }
-}  // namespace raft::runtime::cluster::kmeans
\ No newline at end of file
+}  // namespace raft::runtime::cluster::kmeans
diff --git a/cpp/src/raft_runtime/cluster/update_centroids_double.cu b/cpp/src/raft_runtime/cluster/update_centroids_double.cu
index 0f38c7dd53..0e7b9bf834 100644
--- a/cpp/src/raft_runtime/cluster/update_centroids_double.cu
+++ b/cpp/src/raft_runtime/cluster/update_centroids_double.cu
@@ -15,7 +15,6 @@
  */
 
 #include "update_centroids.cuh"
-#include <raft/cluster/specializations.cuh>
 #include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 
@@ -44,4 +43,4 @@ void update_centroids(raft::device_resources const& handle,
                                 weight_per_cluster);
 }
 
-}  // namespace raft::runtime::cluster::kmeans
\ No newline at end of file
+}  // namespace raft::runtime::cluster::kmeans
diff --git a/cpp/src/raft_runtime/cluster/update_centroids_float.cu b/cpp/src/raft_runtime/cluster/update_centroids_float.cu
index 8f0e79b438..af338d3bd7 100644
--- a/cpp/src/raft_runtime/cluster/update_centroids_float.cu
+++ b/cpp/src/raft_runtime/cluster/update_centroids_float.cu
@@ -15,7 +15,6 @@
  */
 
 #include "update_centroids.cuh"
-#include <raft/cluster/specializations.cuh>
 #include <raft/core/device_resources.hpp>
 #include <raft/distance/distance_types.hpp>
 
@@ -44,4 +43,4 @@ void update_centroids(raft::device_resources const& handle,
                                weight_per_cluster);
 }
 
-}  // namespace raft::runtime::cluster::kmeans
\ No newline at end of file
+}  // namespace raft::runtime::cluster::kmeans
diff --git a/cpp/test/cluster/kmeans.cu b/cpp/test/cluster/kmeans.cu
index cfec84256b..20110eed11 100644
--- a/cpp/test/cluster/kmeans.cu
+++ b/cpp/test/cluster/kmeans.cu
@@ -29,10 +29,6 @@
 #include <rmm/device_uvector.hpp>
 #include <thrust/fill.h>
 
-#if defined RAFT_COMPILED
-#include <raft/cluster/specializations.cuh>
-#endif
-
 namespace raft {
 
 template <typename T>
diff --git a/cpp/test/cluster/kmeans_balanced.cu b/cpp/test/cluster/kmeans_balanced.cu
index 220eba4186..a34f2f3b59 100644
--- a/cpp/test/cluster/kmeans_balanced.cu
+++ b/cpp/test/cluster/kmeans_balanced.cu
@@ -30,10 +30,6 @@
 #include <rmm/device_uvector.hpp>
 #include <thrust/fill.h>
 
-#if defined RAFT_COMPILED
-#include <raft/cluster/specializations.cuh>
-#endif
-
 /* This test takes advantage of the fact that make_blobs generates balanced clusters.
  * It doesn't currently test whether the algorithm can make balanced clusters with an imbalanced
  * dataset.
diff --git a/cpp/test/cluster/kmeans_find_k.cu b/cpp/test/cluster/kmeans_find_k.cu
index a865651f56..bb41d4fafc 100644
--- a/cpp/test/cluster/kmeans_find_k.cu
+++ b/cpp/test/cluster/kmeans_find_k.cu
@@ -25,10 +25,6 @@
 #include <raft/random/make_blobs.cuh>
 #include <raft/util/cuda_utils.cuh>
 
-#if defined RAFT_COMPILED
-#include <raft/cluster/specializations.cuh>
-#endif
-
 namespace raft {
 
 template <typename T>
diff --git a/cpp/test/matrix/select_k.cu b/cpp/test/matrix/select_k.cu
index e92f6c05cc..cbee243c92 100644
--- a/cpp/test/matrix/select_k.cu
+++ b/cpp/test/matrix/select_k.cu
@@ -18,10 +18,6 @@
 
 #include <raft_internal/matrix/select_k.cuh>
 
-#ifdef RAFT_COMPILED
-#include <raft/matrix/specializations.cuh>
-#endif
-
 #include <raft/core/device_resources.hpp>
 #include <raft/random/rng.cuh>
 #include <raft/sparse/detail/utils.h>

From 6196a2b0b51f0ca8f6a7d656b9a587cfb891d4a4 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 13 Apr 2023 20:07:37 +0200
Subject: [PATCH 37/89] Remove remaining specializations

---
 cpp/CMakeLists.txt                            |  9 ----
 cpp/bench/ann/src/raft/raft_benchmark.cu      |  6 +--
 cpp/bench/ann/src/raft/raft_ivf_flat.cu       |  6 +--
 cpp/bench/ann/src/raft/raft_ivf_pq.cu         | 13 +++---
 cpp/bench/prims/distance/distance_common.cuh  |  3 --
 cpp/bench/prims/distance/fused_l2_nn.cu       |  3 --
 cpp/bench/prims/distance/kernels.cu           |  4 --
 cpp/bench/prims/distance/masked_nn.cu         |  4 --
 cpp/bench/prims/neighbors/knn.cuh             |  4 --
 .../prims/neighbors/refine_float_int64_t.cu   |  5 ---
 .../prims/neighbors/refine_uint8_t_int64_t.cu |  4 --
 cpp/include/raft/distance/specializations.cuh | 24 -----------
 .../specializations/detail/kernels.cuh        | 31 -------------
 .../distance/specializations/distance.cuh     | 19 --------
 .../raft/neighbors/specializations.cuh        | 19 --------
 .../raft/sparse/neighbors/specializations.cuh | 20 ---------
 .../raft/spatial/knn/specializations.cuh      | 21 ---------
 .../raft/spatial/knn/specializations/knn.cuh  | 43 -------------------
 cpp/include/raft/spectral/specializations.cuh | 24 -----------
 cpp/include/raft/stats/specializations.cuh    | 24 -----------
 .../detail/kernels/gram_matrix_base_double.cu | 20 ---------
 .../detail/kernels/gram_matrix_base_float.cu  | 20 ---------
 .../kernels/polynomial_kernel_double_int.cu   | 20 ---------
 .../kernels/polynomial_kernel_float_int.cu    | 20 ---------
 .../detail/kernels/rbf_kernel_double.cu       | 20 ---------
 .../detail/kernels/rbf_kernel_float.cu        | 20 ---------
 .../detail/kernels/tanh_kernel_double.cu      | 20 ---------
 .../detail/kernels/tanh_kernel_float.cu       | 20 ---------
 .../brute_force_knn_long_float_int.cu         | 42 ------------------
 .../brute_force_knn_long_float_uint.cu        | 42 ------------------
 .../brute_force_knn_uint32_t_float_int.cu     | 41 ------------------
 .../brute_force_knn_uint32_t_float_uint.cu    | 42 ------------------
 .../raft_runtime/distance/fused_l2_min_arg.cu |  1 -
 .../distance/pairwise_distance.cu             |  3 +-
 .../brute_force_knn_int64_t_float.cu          |  2 -
 cpp/src/raft_runtime/neighbors/ivfpq_build.cu |  1 -
 .../neighbors/ivfpq_search_float_int64_t.cu   |  1 -
 .../neighbors/ivfpq_search_int8_t_int64_t.cu  |  1 -
 .../neighbors/ivfpq_search_uint8_t_int64_t.cu |  1 -
 .../neighbors/refine_d_int64_t_int8_t.cu      |  1 -
 .../neighbors/refine_d_int64_t_uint8_t.cu     |  1 -
 .../neighbors/refine_h_int64_t_float.cu       |  1 -
 .../neighbors/refine_h_int64_t_int8_t.cu      |  1 -
 .../neighbors/refine_h_int64_t_uint8_t.cu     |  1 -
 cpp/template/src/test_distance.cu             |  4 --
 cpp/test/cluster/cluster_solvers.cu           |  4 --
 cpp/test/cluster/linkage.cu                   |  4 --
 cpp/test/distance/fused_l2_nn.cu              |  4 --
 cpp/test/distance/gram.cu                     |  4 --
 cpp/test/distance/masked_nn.cu                |  4 --
 .../ann_cagra/test_float_uint32_t.cu          |  4 --
 cpp/test/neighbors/ann_ivf_flat.cuh           |  6 +--
 .../ann_ivf_flat/test_float_int64_t.cu        |  4 --
 .../ann_ivf_flat/test_int8_t_int64_t.cu       |  4 --
 .../ann_ivf_flat/test_uint8_t_int64_t.cu      |  4 --
 cpp/test/neighbors/ann_ivf_pq.cuh             |  5 ---
 cpp/test/neighbors/ball_cover.cu              |  4 --
 cpp/test/neighbors/epsilon_neighborhood.cu    |  4 --
 cpp/test/neighbors/fused_l2_knn.cu            |  4 --
 cpp/test/neighbors/knn.cu                     |  4 --
 cpp/test/neighbors/refine.cu                  |  4 --
 cpp/test/neighbors/tiled_knn.cu               |  4 --
 cpp/test/sparse/neighbors/knn_graph.cu        |  3 --
 cpp/test/stats/silhouette_score.cu            |  4 --
 cpp/test/stats/trustworthiness.cu             |  4 --
 docs/source/build.md                          | 10 +----
 66 files changed, 10 insertions(+), 714 deletions(-)
 delete mode 100644 cpp/include/raft/distance/specializations.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/detail/kernels.cuh
 delete mode 100644 cpp/include/raft/distance/specializations/distance.cuh
 delete mode 100644 cpp/include/raft/neighbors/specializations.cuh
 delete mode 100644 cpp/include/raft/sparse/neighbors/specializations.cuh
 delete mode 100644 cpp/include/raft/spatial/knn/specializations.cuh
 delete mode 100644 cpp/include/raft/spatial/knn/specializations/knn.cuh
 delete mode 100644 cpp/include/raft/spectral/specializations.cuh
 delete mode 100644 cpp/include/raft/stats/specializations.cuh
 delete mode 100644 cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu
 delete mode 100644 cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu
 delete mode 100644 cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu
 delete mode 100644 cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu
 delete mode 100644 cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu
 delete mode 100644 cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu
 delete mode 100644 cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu
 delete mode 100644 cpp/src/nn/specializations/brute_force_knn_long_float_int.cu
 delete mode 100644 cpp/src/nn/specializations/brute_force_knn_long_float_uint.cu
 delete mode 100644 cpp/src/nn/specializations/brute_force_knn_uint32_t_float_int.cu
 delete mode 100644 cpp/src/nn/specializations/brute_force_knn_uint32_t_float_uint.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 67e9ff7210..fae97315db 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -313,15 +313,6 @@ if(RAFT_COMPILE_LIBRARY)
     src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu
     src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
     src/distance/fused_l2_nn.cu
-    src/distance/specializations/detail/kernels/gram_matrix_base_double.cu
-    src/distance/specializations/detail/kernels/gram_matrix_base_float.cu
-    src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
-    src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu
-    # These are somehow missing a kernel definition which is causing a compile error.
-    # src/distance/specializations/detail/kernels/rbf_kernel_double.cu
-    # src/distance/specializations/detail/kernels/rbf_kernel_float.cu
-    src/distance/specializations/detail/kernels/tanh_kernel_double.cu
-    src/distance/specializations/detail/kernels/tanh_kernel_float.cu
     src/matrix/detail/select_k_double_int64_t.cu
     src/matrix/detail/select_k_double_uint32_t.cu
     src/matrix/detail/select_k_float_int64_t.cu
diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu
index d8e98ce2a9..e85cfa0281 100644
--- a/cpp/bench/ann/src/raft/raft_benchmark.cu
+++ b/cpp/bench/ann/src/raft/raft_benchmark.cu
@@ -22,10 +22,6 @@
 #include <type_traits>
 #include <utility>
 
-#ifdef RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 #include "../common/ann_types.hpp"
 #include "../common/benchmark_util.hpp"
 #undef WARP_SIZE
@@ -220,4 +216,4 @@ std::unique_ptr<typename raft::bench::ann::ANN<T>::AnnSearchParam> create_search
 
 #include "../common/benchmark.hpp"
 
-int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); }
\ No newline at end of file
+int main(int argc, char** argv) { return raft::bench::ann::run_main(argc, argv); }
diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat.cu b/cpp/bench/ann/src/raft/raft_ivf_flat.cu
index ff108080b5..bcd23723a4 100644
--- a/cpp/bench/ann/src/raft/raft_ivf_flat.cu
+++ b/cpp/bench/ann/src/raft/raft_ivf_flat.cu
@@ -15,12 +15,8 @@
  */
 #include "raft_ivf_flat_wrapper.h"
 
-#ifdef RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 namespace raft::bench::ann {
 template class RaftIvfFlatGpu<float, int64_t>;
 template class RaftIvfFlatGpu<uint8_t, int64_t>;
 template class RaftIvfFlatGpu<int8_t, int64_t>;
-}  // namespace raft::bench::ann
\ No newline at end of file
+}  // namespace raft::bench::ann
diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq.cu b/cpp/bench/ann/src/raft/raft_ivf_pq.cu
index 338bc9a32f..47f7f66d3a 100644
--- a/cpp/bench/ann/src/raft/raft_ivf_pq.cu
+++ b/cpp/bench/ann/src/raft/raft_ivf_pq.cu
@@ -15,12 +15,9 @@
  */
 #include "raft_ivf_pq_wrapper.h"
 
-#ifdef RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
-namespace raft::bench::ann {
-template class RaftIvfPQ<float, int64_t>;
-template class RaftIvfPQ<uint8_t, int64_t>;
-template class RaftIvfPQ<int8_t, int64_t>;
+namespace raft::bench::ann
+{
+  template class RaftIvfPQ<float, int64_t>;
+  template class RaftIvfPQ<uint8_t, int64_t>;
+  template class RaftIvfPQ<int8_t, int64_t>;
 }  // namespace raft::bench::ann
diff --git a/cpp/bench/prims/distance/distance_common.cuh b/cpp/bench/prims/distance/distance_common.cuh
index 9b5d67a46f..dff3401b62 100644
--- a/cpp/bench/prims/distance/distance_common.cuh
+++ b/cpp/bench/prims/distance/distance_common.cuh
@@ -17,9 +17,6 @@
 #include <common/benchmark.hpp>
 #include <raft/distance/distance.cuh>
 #include <raft/util/cudart_utils.hpp>
-#if defined RAFT_COMPILED
-#include <raft/distance/specializations.cuh>
-#endif
 #include <rmm/device_uvector.hpp>
 
 namespace raft::bench::distance {
diff --git a/cpp/bench/prims/distance/fused_l2_nn.cu b/cpp/bench/prims/distance/fused_l2_nn.cu
index 1c45572782..d1b3896cc9 100644
--- a/cpp/bench/prims/distance/fused_l2_nn.cu
+++ b/cpp/bench/prims/distance/fused_l2_nn.cu
@@ -17,9 +17,6 @@
 #include <common/benchmark.hpp>
 #include <raft/distance/fused_l2_nn.cuh>
 #include <raft/util/cudart_utils.hpp>
-#if defined RAFT_COMPILED
-#include <raft/distance/specializations.cuh>
-#endif
 #include <rmm/device_uvector.hpp>
 
 namespace raft::bench::distance {
diff --git a/cpp/bench/prims/distance/kernels.cu b/cpp/bench/prims/distance/kernels.cu
index 4407bdcf83..53d97c1fc7 100644
--- a/cpp/bench/prims/distance/kernels.cu
+++ b/cpp/bench/prims/distance/kernels.cu
@@ -13,10 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#if defined RAFT_COMPILED
-#include <raft/distance/specializations.cuh>
-#endif
-
 #include <common/benchmark.hpp>
 #include <memory>
 #include <raft/core/device_resources.hpp>
diff --git a/cpp/bench/prims/distance/masked_nn.cu b/cpp/bench/prims/distance/masked_nn.cu
index f9f234187d..c804ecb3a1 100644
--- a/cpp/bench/prims/distance/masked_nn.cu
+++ b/cpp/bench/prims/distance/masked_nn.cu
@@ -30,10 +30,6 @@
 #include <raft/random/rng.cuh>
 #include <raft/util/cudart_utils.hpp>
 
-#ifdef RAFT_COMPILED
-#include <raft/distance/specializations.cuh>
-#endif
-
 namespace raft::bench::distance::masked_nn {
 
 // Introduce various sparsity patterns
diff --git a/cpp/bench/prims/neighbors/knn.cuh b/cpp/bench/prims/neighbors/knn.cuh
index 8f0b1cb5d9..a987cdc4a2 100644
--- a/cpp/bench/prims/neighbors/knn.cuh
+++ b/cpp/bench/prims/neighbors/knn.cuh
@@ -24,10 +24,6 @@
 #include <raft/neighbors/ivf_pq.cuh>
 #include <raft/spatial/knn/knn.cuh>
 
-#if defined RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 #include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 
diff --git a/cpp/bench/prims/neighbors/refine_float_int64_t.cu b/cpp/bench/prims/neighbors/refine_float_int64_t.cu
index 43be330e9b..bbedc1ae64 100644
--- a/cpp/bench/prims/neighbors/refine_float_int64_t.cu
+++ b/cpp/bench/prims/neighbors/refine_float_int64_t.cu
@@ -17,11 +17,6 @@
 #include "refine.cuh"
 #include <common/benchmark.hpp>
 
-#if defined RAFT_COMPILED
-#include <raft/neighbors/specializations/refine.cuh>
-#include <raft/spatial/knn/specializations.cuh>
-#endif
-
 using namespace raft::neighbors;
 
 namespace raft::bench::neighbors {
diff --git a/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu b/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu
index 1d7cb8c8aa..4952361f03 100644
--- a/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu
+++ b/cpp/bench/prims/neighbors/refine_uint8_t_int64_t.cu
@@ -17,10 +17,6 @@
 #include "refine.cuh"
 #include <common/benchmark.hpp>
 
-#if defined RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 using namespace raft::neighbors;
 
 namespace raft::bench::neighbors {
diff --git a/cpp/include/raft/distance/specializations.cuh b/cpp/include/raft/distance/specializations.cuh
deleted file mode 100644
index 5944534be7..0000000000
--- a/cpp/include/raft/distance/specializations.cuh
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __DISTANCE_SPECIALIZATIONS_H
-#define __DISTANCE_SPECIALIZATIONS_H
-
-#pragma once
-
-#include <raft/distance/specializations/distance.cuh>
-
-#endif
\ No newline at end of file
diff --git a/cpp/include/raft/distance/specializations/detail/kernels.cuh b/cpp/include/raft/distance/specializations/detail/kernels.cuh
deleted file mode 100644
index 75c9c023e8..0000000000
--- a/cpp/include/raft/distance/specializations/detail/kernels.cuh
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/distance/detail/kernels/gram_matrix.cuh>
-#include <raft/distance/detail/kernels/kernel_matrices.cuh>
-
-extern template class raft::distance::kernels::detail::GramMatrixBase<double>;
-extern template class raft::distance::kernels::detail::GramMatrixBase<float>;
-
-extern template class raft::distance::kernels::detail::PolynomialKernel<double, int>;
-extern template class raft::distance::kernels::detail::PolynomialKernel<float, int>;
-
-extern template class raft::distance::kernels::detail::TanhKernel<double>;
-extern template class raft::distance::kernels::detail::TanhKernel<float>;
-
-// These are somehow missing a kernel definition which is causing a compile error
-// extern template class raft::distance::kernels::detail::RBFKernel<double>;
-// extern template class raft::distance::kernels::detail::RBFKernel<float>;
\ No newline at end of file
diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh
deleted file mode 100644
index df53d896d6..0000000000
--- a/cpp/include/raft/distance/specializations/distance.cuh
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/specializations/detail/kernels.cuh>
diff --git a/cpp/include/raft/neighbors/specializations.cuh b/cpp/include/raft/neighbors/specializations.cuh
deleted file mode 100644
index aaca4412e9..0000000000
--- a/cpp/include/raft/neighbors/specializations.cuh
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/specializations.cuh>
diff --git a/cpp/include/raft/sparse/neighbors/specializations.cuh b/cpp/include/raft/sparse/neighbors/specializations.cuh
deleted file mode 100644
index 23ba38ccda..0000000000
--- a/cpp/include/raft/sparse/neighbors/specializations.cuh
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/specializations.cuh>
-#include <raft/neighbors/specializations.cuh>
\ No newline at end of file
diff --git a/cpp/include/raft/spatial/knn/specializations.cuh b/cpp/include/raft/spatial/knn/specializations.cuh
deleted file mode 100644
index 5f0a39a61b..0000000000
--- a/cpp/include/raft/spatial/knn/specializations.cuh
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/neighbors/specializations/ball_cover.cuh>
-#include <raft/neighbors/specializations/brute_force.cuh>
-#include <raft/neighbors/specializations/fused_l2_knn.cuh>
diff --git a/cpp/include/raft/spatial/knn/specializations/knn.cuh b/cpp/include/raft/spatial/knn/specializations/knn.cuh
deleted file mode 100644
index e045487597..0000000000
--- a/cpp/include/raft/spatial/knn/specializations/knn.cuh
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/spatial/knn/knn.cuh>
-
-namespace raft::spatial::knn {
-#define RAFT_INST(IdxT, T, IntT)                                                            \
-  extern template void brute_force_knn<IdxT, T, IntT>(raft::device_resources const& handle, \
-                                                      std::vector<T*>& input,               \
-                                                      std::vector<IntT>& sizes,             \
-                                                      IntT D,                               \
-                                                      T* search_items,                      \
-                                                      IntT n,                               \
-                                                      IdxT* res_I,                          \
-                                                      T* res_D,                             \
-                                                      IntT k,                               \
-                                                      bool rowMajorIndex,                   \
-                                                      bool rowMajorQuery,                   \
-                                                      std::vector<IdxT>* translations,      \
-                                                      distance::DistanceType metric,        \
-                                                      float metric_arg);
-
-RAFT_INST(long, float, int);
-RAFT_INST(long, float, unsigned int);
-RAFT_INST(uint32_t, float, int);
-RAFT_INST(uint32_t, float, unsigned int);
-#undef RAFT_INST
-};  // namespace raft::spatial::knn
diff --git a/cpp/include/raft/spectral/specializations.cuh b/cpp/include/raft/spectral/specializations.cuh
deleted file mode 100644
index 0ce5f0c653..0000000000
--- a/cpp/include/raft/spectral/specializations.cuh
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __SPECTRAL_SPECIALIZATIONS_H
-#define __SPECTRAL_SPECIALIZATIONS_H
-
-#pragma once
-
-#include <raft/distance/specializations.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#endif
\ No newline at end of file
diff --git a/cpp/include/raft/stats/specializations.cuh b/cpp/include/raft/stats/specializations.cuh
deleted file mode 100644
index e6622469d3..0000000000
--- a/cpp/include/raft/stats/specializations.cuh
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef __STATS_SPECIALIZATIONS_H
-#define __STATS_SPECIALIZATIONS_H
-
-#pragma once
-
-#include <raft/distance/specializations.cuh>
-#include <raft/neighbors/specializations.cuh>
-
-#endif
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu
deleted file mode 100644
index 7c80eb29d0..0000000000
--- a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/distance/detail/kernels/gram_matrix.cuh>
-#include <raft/distance/specializations.cuh>
-
-template class raft::distance::kernels::detail::GramMatrixBase<double>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu
deleted file mode 100644
index d777e73dc9..0000000000
--- a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/distance/detail/kernels/gram_matrix.cuh>
-#include <raft/distance/specializations.cuh>
-
-template class raft::distance::kernels::detail::GramMatrixBase<float>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
deleted file mode 100644
index 28306d0c21..0000000000
--- a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/distance/detail/kernels/kernel_matrices.cuh>
-#include <raft/distance/specializations.cuh>
-
-template class raft::distance::kernels::detail::PolynomialKernel<double, int>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu
deleted file mode 100644
index 6609de69ac..0000000000
--- a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/distance/detail/kernels/kernel_matrices.cuh>
-#include <raft/distance/specializations.cuh>
-
-template class raft::distance::kernels::detail::PolynomialKernel<float, int>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu
deleted file mode 100644
index 7ea4b60e09..0000000000
--- a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/distance/detail/kernels/kernel_matrices.cuh>
-#include <raft/distance/specializations.cuh>
-
-template class raft::distance::kernels::detail::RBFKernel<double>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu
deleted file mode 100644
index 423613dcd1..0000000000
--- a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/distance/detail/kernels/kernel_matrices.cuh>
-#include <raft/distance/specializations.cuh>
-
-template class raft::distance::kernels::detail::RBFKernel<float>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu
deleted file mode 100644
index ab818db73b..0000000000
--- a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/distance/detail/kernels/kernel_matrices.cuh>
-#include <raft/distance/specializations.cuh>
-
-template class raft::distance::kernels::detail::TanhKernel<double>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu
deleted file mode 100644
index f7825e577a..0000000000
--- a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/distance/detail/kernels/kernel_matrices.cuh>
-#include <raft/distance/specializations.cuh>
-
-template class raft::distance::kernels::detail::TanhKernel<float>;
\ No newline at end of file
diff --git a/cpp/src/nn/specializations/brute_force_knn_long_float_int.cu b/cpp/src/nn/specializations/brute_force_knn_long_float_int.cu
deleted file mode 100644
index 2c21d1ec64..0000000000
--- a/cpp/src/nn/specializations/brute_force_knn_long_float_int.cu
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/neighbors/specializations.cuh>
-#include <raft/spatial/knn/knn.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-
-template void brute_force_knn<long, float, int>(raft::device_resources const& handle,
-                                                std::vector<float*>& input,
-                                                std::vector<int>& sizes,
-                                                int D,
-                                                float* search_items,
-                                                int n,
-                                                long* res_I,
-                                                float* res_D,
-                                                int k,
-                                                bool rowMajorIndex,
-                                                bool rowMajorQuery,
-                                                std::vector<long>* translations,
-                                                distance::DistanceType metric,
-                                                float metric_arg);
-
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
diff --git a/cpp/src/nn/specializations/brute_force_knn_long_float_uint.cu b/cpp/src/nn/specializations/brute_force_knn_long_float_uint.cu
deleted file mode 100644
index 7e6e7e80d0..0000000000
--- a/cpp/src/nn/specializations/brute_force_knn_long_float_uint.cu
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/neighbors/specializations.cuh>
-#include <raft/spatial/knn/knn.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-
-template void brute_force_knn<long, float, unsigned int>(raft::device_resources const& handle,
-                                                         std::vector<float*>& input,
-                                                         std::vector<unsigned int>& sizes,
-                                                         unsigned int D,
-                                                         float* search_items,
-                                                         unsigned int n,
-                                                         long* res_I,
-                                                         float* res_D,
-                                                         unsigned int k,
-                                                         bool rowMajorIndex,
-                                                         bool rowMajorQuery,
-                                                         std::vector<long>* translations,
-                                                         distance::DistanceType metric,
-                                                         float metric_arg);
-
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
diff --git a/cpp/src/nn/specializations/brute_force_knn_uint32_t_float_int.cu b/cpp/src/nn/specializations/brute_force_knn_uint32_t_float_int.cu
deleted file mode 100644
index e94c12d579..0000000000
--- a/cpp/src/nn/specializations/brute_force_knn_uint32_t_float_int.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/neighbors/specializations.cuh>
-#include <raft/spatial/knn/knn.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-template void brute_force_knn<uint32_t, float, int>(raft::device_resources const& handle,
-                                                    std::vector<float*>& input,
-                                                    std::vector<int>& sizes,
-                                                    int D,
-                                                    float* search_items,
-                                                    int n,
-                                                    uint32_t* res_I,
-                                                    float* res_D,
-                                                    int k,
-                                                    bool rowMajorIndex,
-                                                    bool rowMajorQuery,
-                                                    std::vector<uint32_t>* translations,
-                                                    distance::DistanceType metric,
-                                                    float metric_arg);
-
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
diff --git a/cpp/src/nn/specializations/brute_force_knn_uint32_t_float_uint.cu b/cpp/src/nn/specializations/brute_force_knn_uint32_t_float_uint.cu
deleted file mode 100644
index 95cf8a1eb3..0000000000
--- a/cpp/src/nn/specializations/brute_force_knn_uint32_t_float_uint.cu
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <raft/neighbors/specializations.cuh>
-#include <raft/spatial/knn/knn.cuh>
-
-namespace raft {
-namespace spatial {
-namespace knn {
-
-template void brute_force_knn<uint32_t, float, unsigned int>(raft::device_resources const& handle,
-                                                             std::vector<float*>& input,
-                                                             std::vector<unsigned int>& sizes,
-                                                             unsigned int D,
-                                                             float* search_items,
-                                                             unsigned int n,
-                                                             uint32_t* res_I,
-                                                             float* res_D,
-                                                             unsigned int k,
-                                                             bool rowMajorIndex,
-                                                             bool rowMajorQuery,
-                                                             std::vector<uint32_t>* translations,
-                                                             distance::DistanceType metric,
-                                                             float metric_arg);
-
-};  // namespace knn
-};  // namespace spatial
-};  // namespace raft
diff --git a/cpp/src/raft_runtime/distance/fused_l2_min_arg.cu b/cpp/src/raft_runtime/distance/fused_l2_min_arg.cu
index 487c7e3a4a..bec71ae698 100644
--- a/cpp/src/raft_runtime/distance/fused_l2_min_arg.cu
+++ b/cpp/src/raft_runtime/distance/fused_l2_min_arg.cu
@@ -19,7 +19,6 @@
 #include <raft/core/kvp.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/distance/fused_l2_nn.cuh>
-#include <raft/distance/specializations.cuh>
 #include <raft/linalg/norm.cuh>
 #include <thrust/for_each.h>
 #include <thrust/tuple.h>
diff --git a/cpp/src/raft_runtime/distance/pairwise_distance.cu b/cpp/src/raft_runtime/distance/pairwise_distance.cu
index dfdfa553e9..3c9f0211fe 100644
--- a/cpp/src/raft_runtime/distance/pairwise_distance.cu
+++ b/cpp/src/raft_runtime/distance/pairwise_distance.cu
@@ -17,7 +17,6 @@
 #include <raft/core/device_resources.hpp>
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>
-#include <raft/distance/specializations.cuh>
 
 namespace raft::runtime::distance {
 
@@ -50,4 +49,4 @@ void pairwise_distance(raft::device_resources const& handle,
   raft::distance::pairwise_distance<double, int>(
     handle, x, y, dists, m, n, k, metric, isRowMajor, metric_arg);
 }
-}  // namespace raft::runtime::distance
\ No newline at end of file
+}  // namespace raft::runtime::distance
diff --git a/cpp/src/raft_runtime/neighbors/brute_force_knn_int64_t_float.cu b/cpp/src/raft_runtime/neighbors/brute_force_knn_int64_t_float.cu
index 88545b3607..ea6002eab0 100644
--- a/cpp/src/raft_runtime/neighbors/brute_force_knn_int64_t_float.cu
+++ b/cpp/src/raft_runtime/neighbors/brute_force_knn_int64_t_float.cu
@@ -18,8 +18,6 @@
 #include <raft/core/device_resources.hpp>
 #include <raft/neighbors/brute_force.cuh>
 
-#include <raft/neighbors/specializations.cuh>
-
 #include <raft_runtime/neighbors/brute_force.hpp>
 
 #include <vector>
diff --git a/cpp/src/raft_runtime/neighbors/ivfpq_build.cu b/cpp/src/raft_runtime/neighbors/ivfpq_build.cu
index 8759ca2587..f6f8a541c0 100644
--- a/cpp/src/raft_runtime/neighbors/ivfpq_build.cu
+++ b/cpp/src/raft_runtime/neighbors/ivfpq_build.cu
@@ -15,7 +15,6 @@
  */
 
 #include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
 #include <raft_runtime/neighbors/ivf_pq.hpp>
 
 namespace raft::runtime::neighbors::ivf_pq {
diff --git a/cpp/src/raft_runtime/neighbors/ivfpq_search_float_int64_t.cu b/cpp/src/raft_runtime/neighbors/ivfpq_search_float_int64_t.cu
index 91093d3a39..d55d726671 100644
--- a/cpp/src/raft_runtime/neighbors/ivfpq_search_float_int64_t.cu
+++ b/cpp/src/raft_runtime/neighbors/ivfpq_search_float_int64_t.cu
@@ -15,7 +15,6 @@
  */
 
 #include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
 
 #include <raft_runtime/neighbors/ivf_pq.hpp>
 
diff --git a/cpp/src/raft_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu b/cpp/src/raft_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu
index e1552c0b27..b73cbc0751 100644
--- a/cpp/src/raft_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu
+++ b/cpp/src/raft_runtime/neighbors/ivfpq_search_int8_t_int64_t.cu
@@ -15,7 +15,6 @@
  */
 
 #include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
 
 #include <raft_runtime/neighbors/ivf_pq.hpp>
 
diff --git a/cpp/src/raft_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu b/cpp/src/raft_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu
index 85195a7551..2b3dfe585d 100644
--- a/cpp/src/raft_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu
+++ b/cpp/src/raft_runtime/neighbors/ivfpq_search_uint8_t_int64_t.cu
@@ -15,7 +15,6 @@
  */
 
 #include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/specializations.cuh>
 
 #include <raft_runtime/neighbors/ivf_pq.hpp>
 
diff --git a/cpp/src/raft_runtime/neighbors/refine_d_int64_t_int8_t.cu b/cpp/src/raft_runtime/neighbors/refine_d_int64_t_int8_t.cu
index 817369ed6a..f8a7a8c9c8 100644
--- a/cpp/src/raft_runtime/neighbors/refine_d_int64_t_int8_t.cu
+++ b/cpp/src/raft_runtime/neighbors/refine_d_int64_t_int8_t.cu
@@ -15,7 +15,6 @@
  */
 
 #include <raft/neighbors/refine.cuh>
-#include <raft/neighbors/specializations.cuh>
 
 namespace raft::runtime::neighbors {
 
diff --git a/cpp/src/raft_runtime/neighbors/refine_d_int64_t_uint8_t.cu b/cpp/src/raft_runtime/neighbors/refine_d_int64_t_uint8_t.cu
index fb426b2c02..8f68f9f88e 100644
--- a/cpp/src/raft_runtime/neighbors/refine_d_int64_t_uint8_t.cu
+++ b/cpp/src/raft_runtime/neighbors/refine_d_int64_t_uint8_t.cu
@@ -15,7 +15,6 @@
  */
 
 #include <raft/neighbors/refine.cuh>
-#include <raft/neighbors/specializations.cuh>
 
 namespace raft::runtime::neighbors {
 
diff --git a/cpp/src/raft_runtime/neighbors/refine_h_int64_t_float.cu b/cpp/src/raft_runtime/neighbors/refine_h_int64_t_float.cu
index 1f950dc3b6..7f19d44700 100644
--- a/cpp/src/raft_runtime/neighbors/refine_h_int64_t_float.cu
+++ b/cpp/src/raft_runtime/neighbors/refine_h_int64_t_float.cu
@@ -16,7 +16,6 @@
  */
 
 #include <raft/neighbors/refine.cuh>
-#include <raft/neighbors/specializations.cuh>
 
 namespace raft::runtime::neighbors {
 
diff --git a/cpp/src/raft_runtime/neighbors/refine_h_int64_t_int8_t.cu b/cpp/src/raft_runtime/neighbors/refine_h_int64_t_int8_t.cu
index da99df3618..bd21c6b198 100644
--- a/cpp/src/raft_runtime/neighbors/refine_h_int64_t_int8_t.cu
+++ b/cpp/src/raft_runtime/neighbors/refine_h_int64_t_int8_t.cu
@@ -15,7 +15,6 @@
  */
 
 #include <raft/neighbors/refine.cuh>
-#include <raft/neighbors/specializations.cuh>
 
 namespace raft::runtime::neighbors {
 
diff --git a/cpp/src/raft_runtime/neighbors/refine_h_int64_t_uint8_t.cu b/cpp/src/raft_runtime/neighbors/refine_h_int64_t_uint8_t.cu
index 990754b033..f10d01cc09 100644
--- a/cpp/src/raft_runtime/neighbors/refine_h_int64_t_uint8_t.cu
+++ b/cpp/src/raft_runtime/neighbors/refine_h_int64_t_uint8_t.cu
@@ -15,7 +15,6 @@
  */
 
 #include <raft/neighbors/refine.cuh>
-#include <raft/neighbors/specializations.cuh>
 
 namespace raft::runtime::neighbors {
 
diff --git a/cpp/template/src/test_distance.cu b/cpp/template/src/test_distance.cu
index b86dde70e5..e165cd8f14 100644
--- a/cpp/template/src/test_distance.cu
+++ b/cpp/template/src/test_distance.cu
@@ -20,10 +20,6 @@
 #include <raft/distance/distance.cuh>
 #include <raft/random/make_blobs.cuh>
 
-#ifdef RAFT_COMPILED
-#include <raft/distance/specializations.cuh>
-#endif
-
 int main()
 {
   raft::device_resources handle;
diff --git a/cpp/test/cluster/cluster_solvers.cu b/cpp/test/cluster/cluster_solvers.cu
index f26c598a2b..60e5f62dc0 100644
--- a/cpp/test/cluster/cluster_solvers.cu
+++ b/cpp/test/cluster/cluster_solvers.cu
@@ -19,10 +19,6 @@
 #include <memory>
 #include <raft/core/device_resources.hpp>
 
-#if defined RAFT_COMPILED
-#include <raft/spectral/specializations.cuh>
-#endif
-
 #include <raft/spectral/cluster_solvers.cuh>
 #include <raft/spectral/modularity_maximization.cuh>
 
diff --git a/cpp/test/cluster/linkage.cu b/cpp/test/cluster/linkage.cu
index c5239063c3..5b17c9be63 100644
--- a/cpp/test/cluster/linkage.cu
+++ b/cpp/test/cluster/linkage.cu
@@ -29,10 +29,6 @@
 #include <raft/linalg/transpose.cuh>
 #include <raft/sparse/coo.hpp>
 
-#if defined RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 #include <raft/core/device_mdspan.hpp>
 #include <raft/sparse/hierarchy/single_linkage.cuh>
 #include <raft/util/cudart_utils.hpp>
diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu
index 84ad52a324..b3aa570647 100644
--- a/cpp/test/distance/fused_l2_nn.cu
+++ b/cpp/test/distance/fused_l2_nn.cu
@@ -24,10 +24,6 @@
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
-#if defined RAFT_COMPILED
-#include <raft/distance/specializations.cuh>
-#endif
-
 namespace raft {
 namespace distance {
 
diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu
index f99d02dc7f..32a7493930 100644
--- a/cpp/test/distance/gram.cu
+++ b/cpp/test/distance/gram.cu
@@ -14,10 +14,6 @@
  * limitations under the License.
  */
 
-#if defined RAFT_COMPILED
-#include <raft/distance/specializations.cuh>
-#endif
-
 #include "../test_utils.cuh"
 #include <gtest/gtest.h>
 #include <iostream>
diff --git a/cpp/test/distance/masked_nn.cu b/cpp/test/distance/masked_nn.cu
index 2b6081fcc3..d27c40db46 100644
--- a/cpp/test/distance/masked_nn.cu
+++ b/cpp/test/distance/masked_nn.cu
@@ -29,10 +29,6 @@
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/itertools.hpp>
 
-#ifdef RAFT_COMPILED
-#include <raft/distance/specializations.cuh>
-#endif
-
 namespace raft::distance::masked_nn {
 
 // The adjacency pattern determines what distances get computed.
diff --git a/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu b/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu
index 71a83e2cca..1497a515d2 100644
--- a/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu
+++ b/cpp/test/neighbors/ann_cagra/test_float_uint32_t.cu
@@ -18,10 +18,6 @@
 
 #include "../ann_cagra.cuh"
 
-// #if defined RAFT_DISTANCE_COMPILED
-// #include <raft/neighbors/specializations.cuh>
-// #endif
-
 namespace raft::neighbors::experimental::cagra {
 
 typedef AnnCagraTest<float, float, std::uint32_t> AnnCagraTestF;
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index fe6f9163a0..8cd94cba8e 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -36,10 +36,6 @@
 
 #include <thrust/sequence.h>
 
-#if defined RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 #include <cstddef>
 #include <iostream>
 #include <vector>
@@ -357,4 +353,4 @@ const std::vector<AnnIvfFlatInputs<int64_t>> inputs = {
    raft::distance::DistanceType::InnerProduct,
    false}};
 
-}  // namespace raft::neighbors::ivf_flat
\ No newline at end of file
+}  // namespace raft::neighbors::ivf_flat
diff --git a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
index e430af89df..f0988ca988 100644
--- a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
+++ b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu
@@ -18,10 +18,6 @@
 
 #include "../ann_ivf_flat.cuh"
 
-#if defined RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 namespace raft::neighbors::ivf_flat {
 
 typedef AnnIVFFlatTest<float, float, std::int64_t> AnnIVFFlatTestF;
diff --git a/cpp/test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
index e4e7a207fb..2f542bd6ec 100644
--- a/cpp/test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
+++ b/cpp/test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
@@ -18,10 +18,6 @@
 
 #include "../ann_ivf_flat.cuh"
 
-#if defined RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 namespace raft::neighbors::ivf_flat {
 
 typedef AnnIVFFlatTest<float, int8_t, std::int64_t> AnnIVFFlatTestF_int8;
diff --git a/cpp/test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
index ef7980401a..7659707089 100644
--- a/cpp/test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
+++ b/cpp/test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
@@ -18,10 +18,6 @@
 
 #include "../ann_ivf_flat.cuh"
 
-#if defined RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 namespace raft::neighbors::ivf_flat {
 
 typedef AnnIVFFlatTest<float, uint8_t, std::int64_t> AnnIVFFlatTestF_uint8;
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 458a40d9f2..3f09f605c6 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -25,11 +25,6 @@
 #include <raft/neighbors/ivf_pq.cuh>
 #include <raft/neighbors/ivf_pq_serialize.cuh>
 #include <raft/random/rng.cuh>
-#ifdef RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#else
-#pragma message("NN specializations are not enabled; expect very long building times.")
-#endif
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
diff --git a/cpp/test/neighbors/ball_cover.cu b/cpp/test/neighbors/ball_cover.cu
index 46ef3a9150..19935154df 100644
--- a/cpp/test/neighbors/ball_cover.cu
+++ b/cpp/test/neighbors/ball_cover.cu
@@ -23,10 +23,6 @@
 #include <raft/random/make_blobs.cuh>
 #include <raft/util/cudart_utils.hpp>
 
-#ifdef RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
diff --git a/cpp/test/neighbors/epsilon_neighborhood.cu b/cpp/test/neighbors/epsilon_neighborhood.cu
index 769cb7ec2d..c78a15dd2d 100644
--- a/cpp/test/neighbors/epsilon_neighborhood.cu
+++ b/cpp/test/neighbors/epsilon_neighborhood.cu
@@ -23,10 +23,6 @@
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
 
-#ifdef RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 namespace raft {
 namespace spatial {
 namespace knn {
diff --git a/cpp/test/neighbors/fused_l2_knn.cu b/cpp/test/neighbors/fused_l2_knn.cu
index d7e0e1e067..9fbccf681d 100644
--- a/cpp/test/neighbors/fused_l2_knn.cu
+++ b/cpp/test/neighbors/fused_l2_knn.cu
@@ -23,10 +23,6 @@
 #include <raft/random/rng.cuh>
 #include <raft/spatial/knn/knn.cuh>
 
-#ifdef RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 #include <raft/distance/distance.cuh>
 
 #include <rmm/device_buffer.hpp>
diff --git a/cpp/test/neighbors/knn.cu b/cpp/test/neighbors/knn.cu
index bcd4b9cb0b..e0f2c2e58e 100644
--- a/cpp/test/neighbors/knn.cu
+++ b/cpp/test/neighbors/knn.cu
@@ -21,10 +21,6 @@
 #include <raft/distance/distance_types.hpp>
 #include <raft/neighbors/brute_force.cuh>
 
-#ifdef RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 #include <rmm/device_buffer.hpp>
 
 #include <gtest/gtest.h>
diff --git a/cpp/test/neighbors/refine.cu b/cpp/test/neighbors/refine.cu
index dd3491673e..d868ba06cf 100644
--- a/cpp/test/neighbors/refine.cu
+++ b/cpp/test/neighbors/refine.cu
@@ -31,10 +31,6 @@
 
 #include <gtest/gtest.h>
 
-#if defined RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 #include <vector>
 
 namespace raft::neighbors {
diff --git a/cpp/test/neighbors/tiled_knn.cu b/cpp/test/neighbors/tiled_knn.cu
index ccc3a64edd..570af5dc7a 100644
--- a/cpp/test/neighbors/tiled_knn.cu
+++ b/cpp/test/neighbors/tiled_knn.cu
@@ -25,10 +25,6 @@
 #include <raft/matrix/init.cuh>
 #include <raft/neighbors/brute_force.cuh>
 
-#if defined RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
-
 #include <rmm/device_buffer.hpp>
 
 #include <gtest/gtest.h>
diff --git a/cpp/test/sparse/neighbors/knn_graph.cu b/cpp/test/sparse/neighbors/knn_graph.cu
index 8873445c37..aadb00879b 100644
--- a/cpp/test/sparse/neighbors/knn_graph.cu
+++ b/cpp/test/sparse/neighbors/knn_graph.cu
@@ -22,9 +22,6 @@
 
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/neighbors/knn_graph.cuh>
-#if defined RAFT_COMPILED
-#include <raft/neighbors/specializations.cuh>
-#endif
 
 #include <iostream>
 
diff --git a/cpp/test/stats/silhouette_score.cu b/cpp/test/stats/silhouette_score.cu
index 40b7e59d81..9ad89d59c0 100644
--- a/cpp/test/stats/silhouette_score.cu
+++ b/cpp/test/stats/silhouette_score.cu
@@ -20,10 +20,6 @@
 #include <raft/distance/distance_types.hpp>
 #include <raft/util/cudart_utils.hpp>
 
-#if defined RAFT_COMPILED
-#include <raft/stats/specializations.cuh>
-#endif
-
 #include <raft/stats/silhouette_score.cuh>
 #include <random>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/test/stats/trustworthiness.cu b/cpp/test/stats/trustworthiness.cu
index 2fde6b29c1..15b27c7669 100644
--- a/cpp/test/stats/trustworthiness.cu
+++ b/cpp/test/stats/trustworthiness.cu
@@ -20,10 +20,6 @@
 #include <raft/distance/distance.cuh>
 #include <raft/util/cudart_utils.hpp>
 
-#if defined RAFT_COMPILED
-#include <raft/stats/specializations.cuh>
-#endif
-
 #include <raft/stats/trustworthiness_score.cuh>
 #include <vector>
 
diff --git a/docs/source/build.md b/docs/source/build.md
index d7550eb631..dfc76fda66 100644
--- a/docs/source/build.md
+++ b/docs/source/build.md
@@ -272,15 +272,7 @@ If the RAFT headers have already been installed into your environment with cmake
 
 Use `find_package(raft COMPONENTS compiled distributed)` to enable the shared library and transitively pass dependencies through separate targets for each component. In this example, the `raft::compiled` and `raft::distributed` targets will be available for configuring linking paths in addition to `raft::raft`. These targets will also pass through any transitive dependencies (such as NCCL for the `distributed` component).
 
-The pre-compiled libraries contain template specializations for commonly used types, such as single- and double-precision floating-point. In order to use the symbols in the pre-compiled libraries, the compiler needs to be told not to instantiate templates that are already contained in the shared libraries. By convention, these header files are named `specializations.cuh` and located in the base directory for the packages that contain specializations.
-
-The following example tells the compiler to ignore the pre-compiled templates for the `raft::distance` API so any symbols already compiled into the `libraft` shared library will be used instead. RAFT's cmake creates a variable `RAFT_COMPILED` which can be used to ignore the pre-compiled template specializations only when the shared library has been enabled through cmake (such as by specifying the `compiled` component in `find_package`):
-```c++
-#ifdef RAFT_COMPILED
-#include <raft/distance/distance.cuh>
-#include <raft/distance/specializations.cuh>
-#endif
-```
+The pre-compiled libraries contain template instantiations for commonly used types, such as single- and double-precision floating-point. These instantiations are used automatically whenever the `RAFT_COMPILED` macro is defined during compilation; CMake adds this definition for you when the `compiled` component is enabled, so no extra includes are required.
 
 ### Building RAFT C++ from source in cmake
 

From c72e70a1eb4a214337aecf4b3cfc3233cb4aa1ec Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 10:36:09 +0200
Subject: [PATCH 38/89] Sort src files in CMakeLists.txt

---
 cpp/CMakeLists.txt | 71 ++++++++++++++++++++++------------------------
 1 file changed, 34 insertions(+), 37 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index fae97315db..94a30254dc 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -276,16 +276,6 @@ if(RAFT_COMPILE_LIBRARY)
   add_library(
     raft_lib
     src/core/logger.cpp
-    src/linalg/detail/coalesced_reduction.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
-    src/neighbors/detail/selection_faiss.cu
-    src/distance/distance.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu
@@ -306,41 +296,48 @@ if(RAFT_COMPILE_LIBRARY)
     src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu
     src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu
-    src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu
-    src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu
     src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu
     src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
+    src/distance/distance.cu
     src/distance/fused_l2_nn.cu
+    src/linalg/detail/coalesced_reduction.cu
     src/matrix/detail/select_k_double_int64_t.cu
     src/matrix/detail/select_k_double_uint32_t.cu
     src/matrix/detail/select_k_float_int64_t.cu
     src/matrix/detail/select_k_float_uint32_t.cu
     src/matrix/detail/select_k_half_int64_t.cu
     src/matrix/detail/select_k_half_uint32_t.cu
-    src/neighbors/ivf_flat_build_float_int64_t.cu
-    src/neighbors/ivf_flat_extend_float_int64_t.cu
-    src/neighbors/ivf_flat_search_float_int64_t.cu
-    src/neighbors/ivf_flat_build_int8_t_int64_t.cu
-    src/neighbors/ivf_flat_extend_int8_t_int64_t.cu
-    src/neighbors/ivf_flat_search_int8_t_int64_t.cu
-    src/neighbors/ivf_flat_build_uint8_t_int64_t.cu
-    src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu
-    src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
-    src/neighbors/ivfpq_search_float_int64_t.cu
-    src/neighbors/ivfpq_search_int8_t_int64_t.cu
-    src/neighbors/ivfpq_search_uint8_t_int64_t.cu
     src/neighbors/ball_cover.cu
-    src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
+    src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu
     src/neighbors/brute_force_knn_int64_t_float_int64_t.cu
+    src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
     src/neighbors/brute_force_knn_int_float_int.cu
     src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
-    src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu
     src/neighbors/detail/ivf_flat_interleaved_scan_float_float_int64_t.cu
     src/neighbors/detail/ivf_flat_interleaved_scan_int8_t_int32_t_int64_t.cu
     src/neighbors/detail/ivf_flat_interleaved_scan_uint8_t_uint32_t_int64_t.cu
     src/neighbors/detail/ivf_flat_search.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
+    src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
+    src/neighbors/detail/selection_faiss.cu
+    src/neighbors/ivf_flat_build_float_int64_t.cu
+    src/neighbors/ivf_flat_build_int8_t_int64_t.cu
+    src/neighbors/ivf_flat_build_uint8_t_int64_t.cu
+    src/neighbors/ivf_flat_extend_float_int64_t.cu
+    src/neighbors/ivf_flat_extend_int8_t_int64_t.cu
+    src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu
+    src/neighbors/ivf_flat_search_float_int64_t.cu
+    src/neighbors/ivf_flat_search_int8_t_int64_t.cu
+    src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
     src/neighbors/ivfpq_build_float_int64_t.cu
     src/neighbors/ivfpq_build_int8_t_int64_t.cu
     src/neighbors/ivfpq_build_uint8_t_int64_t.cu
@@ -384,21 +381,21 @@ if(RAFT_COMPILE_LIBRARY)
     src/raft_runtime/random/rmat_rectangular_generator_int64_float.cu
     src/raft_runtime/random/rmat_rectangular_generator_int_double.cu
     src/raft_runtime/random/rmat_rectangular_generator_int_float.cu
-    src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
-    src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
-    src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
-    src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu
-    src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu
-    src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu
-    src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu
     src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu
     src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu
-    src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu
-    src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu
-    src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu
-    src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu
     src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu
     src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu
+    src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu
+    src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
+    src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
+    src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
     src/util/memory_pool.cpp
   )
   set_target_properties(

From c37b73e915d7ac98da95d9afae99ee514edcf384 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 13:44:37 +0200
Subject: [PATCH 39/89] Document RAFT_EXPLICIT

---
 cpp/include/raft/util/raft_explicit.hpp | 113 +++++++++++++++---------
 1 file changed, 69 insertions(+), 44 deletions(-)

diff --git a/cpp/include/raft/util/raft_explicit.hpp b/cpp/include/raft/util/raft_explicit.hpp
index ceb82fadf7..8b5c97390e 100644
--- a/cpp/include/raft/util/raft_explicit.hpp
+++ b/cpp/include/raft/util/raft_explicit.hpp
@@ -14,51 +14,76 @@
  */
 #pragma once
 
-#define RAFT_EXPLICIT                                                     \
-  {                                                                       \
-    raft::util::raft_explicit::do_not_implicitly_instantiate_templates(); \
-    throw "raft_explicit_error";                                          \
+/**
+ * @brief Prevents a function template from being implicitly instantiated
+ *
+ * This macro defines a function body that can be used for function template
+ * definitions of functions that should not be implicitly instantiated.
+ *
+ * When the template is erroneously implicitly instantiated, it provides a
+ * useful error message that tells the user how to avoid the implicit
+ * instantiation.
+ *
+ * The error message is generated using a static assert. It is generally tricky
+ * to have a static assert fire only when you want it, as documented in
+ * P2593: https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html
+ *
+ * We use the strategy from paragraph 1.3 here. We define a struct
+ * `not_allowed`, whose type is dependent on the template parameters of the
+ * enclosing function instance. We use this struct type to instantiate the
+ * `implicit_instantiation` template class, whose value is always false. We pass
+ * this value to static_assert. This way, the static assert only fires when the
+ * template is instantiated, since `implicit_instantiation` cannot be
+ * instantiated without all the types in the enclosing function template.
+ */
+#define RAFT_EXPLICIT                                                                          \
+  {                                                                                            \
+    /* Type of `not_allowed` depends on template parameters of enclosing function. */          \
+    struct not_allowed {                                                                       \
+    };                                                                                         \
+    static_assert(                                                                             \
+      raft::util::raft_explicit::implicit_instantiation<not_allowed>::value,                   \
+      "ACCIDENTAL_IMPLICIT_INSTANTIATION\n\n"                                                  \
+                                                                                               \
+      "If you see this error, then you have implicitly instantiated a function\n"              \
+      "template. To keep compile times in check, libraft has the policy of\n"                  \
+      "explicitly instantiating templates. To fix the compilation error, follow\n"             \
+      "these steps.\n\n"                                                                       \
+                                                                                               \
+      "If you scroll up or down a bit, you probably saw a line like the following:\n\n"        \
+                                                                                               \
+      "detected during instantiation of \"void raft::foo(T) [with T=float]\" at line [..]\n\n" \
+                                                                                               \
+      "Simplest temporary solution:\n\n"                                                       \
+                                                                                               \
+      "    Add '#undef RAFT_EXPLICIT_INSTANTIATE' at the top of your .cpp/.cu file.\n\n"       \
+                                                                                               \
+      "Best solution:\n\n"                                                                     \
+                                                                                               \
+      "    1. Add the following line to the file include/raft/foo.hpp:\n\n"                    \
+                                                                                               \
+      "        extern template void raft::foo<float>(float);\n\n"                              \
+                                                                                               \
+      "    2. Add the following line to the file src/raft/foo.cpp:\n\n"                        \
+                                                                                               \
+      "        template void raft::foo<float>(float);\n");                                     \
+                                                                                               \
+    /* Function may have non-void return type. */                                              \
+    /* To prevent warnings/errors about missing returns, throw an exception. */                \
+    throw "raft_explicit_error";                                                               \
+  }
 
 namespace raft::util::raft_explicit {
-
-// To make sure the static_assert only fires when
-// do_not_implicitly_instantiate_templates is instantiated, we use a dummy
-// template parameter as described in P2593:
-// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html
-template <bool implicit_instantiation_allowed = false>
-void do_not_implicitly_instantiate_templates()
-{
-  static_assert(implicit_instantiation_allowed,
-                "ACCIDENTAL_IMPLICIT_INSTANTIATION\n\n"
-
-                "If you see this error, then you have implicitly instantiated a function\n"
-                "template. To keep compile times in check, libfoo has the policy of\n"
-                "explicitly instantiating templates. To fix the compilation error, follow\n"
-                "these steps.\n\n"
-
-                "If you scroll up a bit in your error message, you probably saw two lines\n"
-                "like the following:\n\n"
-
-                "[.. snip ..] required from ‘void raft::do_not_implicitly_instantiate_templates() "
-                "[with int dummy = 0]’\n"
-                "[.. snip ..] from ‘void raft::bar(T) [with T = double]’\n\n"
-
-                "Simple solution:\n\n"
-
-                "    Add '#undef RAFT_EXPLICIT_INSTANTIATE' at the top of your .cpp/.cu file.\n\n"
-
-                "Best solution:\n\n"
-
-                "    1. Add the following line to the file include/raft/bar.hpp:\n\n"
-
-                "        extern template void raft::bar<double>(double);\n\n"
-
-                "    2. Add the following line to the file src/raft/bar.cpp:\n\n"
-
-                "        template void raft::bar<double>(double)\n\n"
-
-                "Probability is that there are many other similar lines in both files.\n");
-}
-
+/**
+ * @brief Template that is always false
+ *
+ * This template is from paragraph 1.3 of P2593:
+ * https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p2593r0.html
+ *
+ * The value of `value` is always false, but it depends on a template parameter.
+ */
+template <typename T>
+struct implicit_instantiation {
+  static constexpr bool value = false;
+};
 }  // namespace raft::util::raft_explicit

From 75c076b755c704f9b27519f3628a2091a2690deb Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 13:46:26 +0200
Subject: [PATCH 40/89] Undo custom RMM

---
 cpp/CMakeLists.txt | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 94a30254dc..dc32209c0d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -13,17 +13,6 @@
 set(RAPIDS_VERSION "23.06")
 set(RAFT_VERSION "23.06.00")
 
-include(FetchContent)
-FetchContent_Declare(
-  rapids-cmake
-  GIT_REPOSITORY https://github.com/ahendriksen/rapids-cmake.git
-  GIT_TAG different-rmm
-)
-file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.02/RAPIDS.cmake
-     ${CMAKE_CURRENT_BINARY_DIR}/RAPIDS.cmake
-)
-include(${CMAKE_CURRENT_BINARY_DIR}/RAPIDS.cmake)
-
 cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 include(../fetch_rapids.cmake)
 include(rapids-cmake)

From e97e0bde8e4c2ff3e933ca75bda75bc61b6feae9 Mon Sep 17 00:00:00 2001
From: Raymond Douglass <ray@raydouglass.com>
Date: Thu, 23 Mar 2023 14:57:18 -0400
Subject: [PATCH 41/89] DOC

---
 dependencies.yaml               | 9 +++++----
 python/raft-dask/pyproject.toml | 5 -----
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 66b1c31b2b..7254d12cb3 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -278,10 +278,11 @@ dependencies:
           - ucx>=1.13.0
           - ucx-py=0.32.*
           - ucx-proc=*=gpu
-      - output_types: pyproject
-        packages:
-          - pylibraft==23.6.*
-  test_python_common:
+          - rmm=23.06
+          - libfaiss>=1.7.1=cuda*
+          - faiss-proc=*=cuda
+          - dask-cuda=23.06
+  test_python:
     common:
       - output_types: [conda, requirements]
         packages:
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 597f1a8764..28938e1590 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -39,11 +39,6 @@ dependencies = [
     "joblib>=0.11",
     "dask-cuda==23.6.*",
     "dask>=2023.1.1",
-    "distributed>=2023.1.1",
-    "joblib>=0.11",
-    "numba>=0.49",
-    "numpy>=1.21",
-    "pylibraft==23.6.*",
     "ucx-py==0.32.*",
     "distributed>=2023.1.1",
     "pylibraft==23.6.*",

From bcadeaeddd2dce0a222cc1b5947d1e2ce5724526 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyas.ramasubramani@gmail.com>
Date: Wed, 29 Mar 2023 21:38:40 -0400
Subject: [PATCH 42/89] Update pylibraft version

---
 dependencies.yaml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 7254d12cb3..66b1c31b2b 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -278,11 +278,10 @@ dependencies:
           - ucx>=1.13.0
           - ucx-py=0.32.*
           - ucx-proc=*=gpu
-          - rmm=23.06
-          - libfaiss>=1.7.1=cuda*
-          - faiss-proc=*=cuda
-          - dask-cuda=23.06
-  test_python:
+      - output_types: pyproject
+        packages:
+          - pylibraft==23.6.*
+  test_python_common:
     common:
       - output_types: [conda, requirements]
         packages:

From c767b8718b84cdc4b6668c82f437da4cbd71f561 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyas.ramasubramani@gmail.com>
Date: Wed, 29 Mar 2023 21:48:31 -0400
Subject: [PATCH 43/89] Run dfg

---
 python/raft-dask/pyproject.toml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 28938e1590..597f1a8764 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -39,6 +39,11 @@ dependencies = [
     "joblib>=0.11",
     "dask-cuda==23.6.*",
     "dask>=2023.1.1",
+    "distributed>=2023.1.1",
+    "joblib>=0.11",
+    "numba>=0.49",
+    "numpy>=1.21",
+    "pylibraft==23.6.*",
     "ucx-py==0.32.*",
     "distributed>=2023.1.1",
     "pylibraft==23.6.*",

From 30d09b16ab8ccb86c41e595f051194d5803a5a35 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 5 Apr 2023 20:31:12 -0500
Subject: [PATCH 44/89] Pin `dask` and `distributed` for release (#1399)

This PR pins `dask` and `distributed` to `2023.3.2` and `2023.3.2.1` respectively for `23.04` release.

xref: https://github.com/rapidsai/cudf/pull/13070

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)
  - Ray Douglass (https://github.com/raydouglass)
  - Ben Frederickson (https://github.com/benfred)
  - Joseph (https://github.com/jolorunyomi)

URL: https://github.com/rapidsai/raft/pull/1399
---
 .github/workflows/pr.yaml                        | 4 ++--
 .github/workflows/test.yaml                      | 4 ++--
 conda/environments/all_cuda-118_arch-x86_64.yaml | 7 ++++---
 python/raft-dask/pyproject.toml                  | 9 +++------
 4 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 2085f89414..0ce32b73b6 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -103,7 +103,7 @@ jobs:
       build_type: pull-request
       package-name: raft_dask
       # Always want to test against latest dask/distributed.
-      test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
-      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
+      test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
+      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
       test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"
       test-smoketest: "python ./ci/wheel_smoke_test_raft_dask.py"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index e0731a9a97..11ff3333d1 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -51,6 +51,6 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       package-name: raft_dask
-      test-before-amd64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
-      test-before-arm64: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
+      test-before-amd64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
+      test-before-arm64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
       test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index a992ebddb1..0f1999c93c 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -18,9 +18,10 @@ dependencies:
 - cupy
 - cxx-compiler
 - cython>=0.29,<0.30
-- dask-cuda=23.06
-- dask>=2023.1.1
-- distributed>=2023.1.1
+- dask-core==2023.3.2
+- dask-cuda==23.4.*
+- dask==2023.3.2
+- distributed==2023.3.2.1
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - graphviz
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 597f1a8764..4901df6c38 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -34,12 +34,9 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.8"
 dependencies = [
-    "numpy",
-    "numba>=0.49",
-    "joblib>=0.11",
-    "dask-cuda==23.6.*",
-    "dask>=2023.1.1",
-    "distributed>=2023.1.1",
+    "dask-cuda==23.4.*",
+    "dask==2023.3.2",
+    "distributed==2023.3.2.1",
     "joblib>=0.11",
     "numba>=0.49",
     "numpy>=1.21",

From facb5d2e992abd8b0dfd141e78693d2c752bf4cc Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 7 Apr 2023 08:22:33 -0400
Subject: [PATCH 45/89] Have consistent compile lines between BUILD_TESTS
 enabled or not (#1401)

This will remove 1h from our conda CI builds since we can now re-use the cached object files between `libraft` and `libraft-tests`

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Ben Frederickson (https://github.com/benfred)
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/raft/pull/1401
---
 cpp/CMakeLists.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index dc32209c0d..1824ebb241 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -70,11 +70,13 @@ option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations"
        ${RAFT_COMPILE_LIBRARY_DEFAULT}
 )
 
-# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to
-# have different values for the `Threads::Threads` target. Setting this flag ensures
+
+# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs
+# to have different values for the `Threads::Threads` target. Setting this flag ensures
 # `Threads::Threads` is the same value across all builds so that cache hits occur
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 
+
 include(CMakeDependentOption)
 # cmake_dependent_option( RAFT_USE_FAISS_STATIC "Build and statically link the FAISS library for
 # nearest neighbors search on GPU" ON RAFT_COMPILE_LIBRARY OFF )

From d3626f92445494c2e84cd5fc4abc1be03fddbfd9 Mon Sep 17 00:00:00 2001
From: Divye Gala <divyegala@gmail.com>
Date: Thu, 13 Apr 2023 19:15:18 -0400
Subject: [PATCH 46/89] Generate build metrics report for test and benchmarks
 (#1414)

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/1414
---
 build.sh                                      | 37 ++++++++++++++-----
 conda/recipes/libraft/build_libraft.sh        |  2 +-
 .../recipes/libraft/build_libraft_nn_bench.sh |  2 +-
 conda/recipes/libraft/build_libraft_tests.sh  |  2 +-
 4 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/build.sh b/build.sh
index 270c75de93..039f0ed6a5 100755
--- a/build.sh
+++ b/build.sh
@@ -18,8 +18,8 @@ ARGS=$*
 # scripts, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
 
-VALIDARGS="clean libraft pylibraft raft-dask docs tests template bench-prims bench-ann clean --uninstall  -v -g -n --compile-lib --allgpuarch --no-nvtx --show_depr_warn  --build-metrics --incl-cache-stats --time -h"
-HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<tool>] [--limit-tests=<targets>] [--limit-bench-prims=<targets>] [--limit-bench-ann=<targets>]
+VALIDARGS="clean libraft pylibraft raft-dask docs tests template bench-prims bench-ann clean --uninstall  -v -g -n --compile-lib --allgpuarch --no-nvtx --show_depr_warn --incl-cache-stats --time -h"
+HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<tool>] [--limit-tests=<targets>] [--limit-bench-prims=<targets>] [--limit-bench-ann=<targets>] [--build-metrics=<filename>]
  where <target> is:
    clean            - remove all existing build artifacts and configuration (start over)
    libraft          - build the raft C++ code only. Also builds the C-wrapper library
@@ -45,7 +45,7 @@ HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<to
    --allgpuarch                - build for all supported GPU architectures
    --no-nvtx                   - disable nvtx (profiling markers), but allow enabling it in downstream projects
    --show_depr_warn            - show cmake deprecation warnings
-   --build-metrics             - generate build metrics report for libraft
+   --build-metrics             - filename for generating build metrics report for libraft
    --incl-cache-stats          - include cache statistics in build metrics report
    --cmake-args=\\\"<args>\\\" - pass arbitrary list of CMake configuration options (escape all quotes in argument)
    --cache-tool=<tool>         - pass the build cache tool (eg: ccache, sccache, distcc) that will be used
@@ -73,7 +73,7 @@ BUILD_PRIMS_BENCH=OFF
 BUILD_ANN_BENCH=OFF
 COMPILE_LIBRARY=OFF
 INSTALL_TARGET=install
-BUILD_REPORT_METRICS=OFF
+BUILD_REPORT_METRICS=""
 BUILD_REPORT_INCL_CACHE_STATS=OFF
 
 TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NEIGHBORS_TEST;NEIGHBORS_TEST;STATS_TEST;UTILS_TEST"
@@ -189,6 +189,25 @@ function limitAnnBench {
     fi
 }
 
+function buildMetrics {
+    # Reject the command line if --build-metrics appears more than once (grep -o emits one line per match).
+    if [[ $(echo $ARGS | { grep -Eo "\-\-build\-metrics" || true; } | wc -l ) -gt 1 ]]; then
+        echo "Multiple --build-metrics options were provided, please provide only one: ${ARGS}"
+        exit 1
+    fi
+    # Extract the report filename from --build-metrics=<filename>, if the option is present.
+    if [[ -n $(echo $ARGS | { grep -E "\-\-build\-metrics" || true; } ) ]]; then
+        # `|| true` above keeps a non-matching grep from returning a failing status. If the sed filter
+        # below yields an empty name (e.g. "--build-metrics= ..."), ARGS is left untouched, so the
+        # stray option is reported later by the invalid-option check instead of being silently ignored.
+        BUILD_REPORT_METRICS=$(echo $ARGS | sed -e 's/.*--build-metrics=//' -e 's/ .*//')
+        if [[ -n ${BUILD_REPORT_METRICS} ]]; then
+            # Drop "--build-metrics=<name>" from ARGS so the remaining arguments pass the VALIDARGS check.
+            ARGS=${ARGS//--build-metrics=$BUILD_REPORT_METRICS/}
+        fi
+    fi
+}
+
 if hasArg -h || hasArg --help; then
     echo "${HELP}"
     exit 0
@@ -201,6 +220,7 @@ if (( ${NUMARGS} != 0 )); then
     limitTests
     limitBench
     limitAnnBench
+    buildMetrics
     for a in ${ARGS}; do
         if ! (echo " ${VALIDARGS} " | grep -q " ${a} "); then
             echo "Invalid option: ${a}"
@@ -339,9 +359,6 @@ fi
 if hasArg clean; then
     CLEAN=1
 fi
-if hasArg --build-metrics; then
-    BUILD_REPORT_METRICS=ON
-fi
 if hasArg --incl-cache-stats; then
     BUILD_REPORT_INCL_CACHE_STATS=ON
 fi
@@ -422,7 +439,7 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has
   compile_end=$(date +%s)
   compile_total=$(( compile_end - compile_start ))
 
-  if [[ "$BUILD_REPORT_METRICS" == "ON" && -f "${LIBRAFT_BUILD_DIR}/.ninja_log" ]]; then
+  if [[ -n "$BUILD_REPORT_METRICS" && -f "${LIBRAFT_BUILD_DIR}/.ninja_log" ]]; then
       if ! rapids-build-metrics-reporter.py 2> /dev/null && [ ! -f rapids-build-metrics-reporter.py ]; then
           echo "Downloading rapids-build-metrics-reporter.py"
           curl -sO https://raw.githubusercontent.com/rapidsai/build-metrics-reporter/v1/rapids-build-metrics-reporter.py
@@ -454,13 +471,13 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has
           MSG="${MSG}<br/>libraft.so size: $LIBRAFT_FS"
       fi
       BMR_DIR=${RAPIDS_ARTIFACTS_DIR:-"${LIBRAFT_BUILD_DIR}"}
-      echo "The HTML report can be found at [${BMR_DIR}/ninja_log.html]. In CI, this report"
+      echo "The HTML report can be found at [${BMR_DIR}/${BUILD_REPORT_METRICS}.html]. In CI, this report"
       echo "will also be uploaded to the appropriate subdirectory of https://downloads.rapids.ai/ci/raft/, and"
       echo "the entire URL can be found in \"conda-cpp-build\" runs under the task \"Upload additional artifacts\""
       mkdir -p ${BMR_DIR}
       MSG_OUTFILE="$(mktemp)"
       echo "$MSG" > "${MSG_OUTFILE}"
-      PATH=".:$PATH" python rapids-build-metrics-reporter.py ${LIBRAFT_BUILD_DIR}/.ninja_log --fmt html --msg "${MSG_OUTFILE}" > ${BMR_DIR}/ninja_log.html
+      PATH=".:$PATH" python rapids-build-metrics-reporter.py ${LIBRAFT_BUILD_DIR}/.ninja_log --fmt html --msg "${MSG_OUTFILE}" > ${BMR_DIR}/${BUILD_REPORT_METRICS}.html
       cp ${LIBRAFT_BUILD_DIR}/.ninja_log ${BMR_DIR}/ninja.log
   fi
 fi
diff --git a/conda/recipes/libraft/build_libraft.sh b/conda/recipes/libraft/build_libraft.sh
index 2bf9b428cb..7d4173e8bb 100644
--- a/conda/recipes/libraft/build_libraft.sh
+++ b/conda/recipes/libraft/build_libraft.sh
@@ -1,4 +1,4 @@
 #!/usr/bin/env bash
 # Copyright (c) 2022-2023, NVIDIA CORPORATION.
 
-./build.sh libraft --allgpuarch --compile-lib --build-metrics --incl-cache-stats --no-nvtx
+./build.sh libraft --allgpuarch --compile-lib --build-metrics=compile_lib --incl-cache-stats --no-nvtx
diff --git a/conda/recipes/libraft/build_libraft_nn_bench.sh b/conda/recipes/libraft/build_libraft_nn_bench.sh
index dc6250f0f4..00078792a1 100644
--- a/conda/recipes/libraft/build_libraft_nn_bench.sh
+++ b/conda/recipes/libraft/build_libraft_nn_bench.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
-./build.sh tests bench-ann --allgpuarch --no-nvtx
+./build.sh bench-ann --allgpuarch --no-nvtx --build-metrics=bench_ann --incl-cache-stats
 cmake --install cpp/build --component ann_bench
diff --git a/conda/recipes/libraft/build_libraft_tests.sh b/conda/recipes/libraft/build_libraft_tests.sh
index cc28f93fb8..05a2b59eb0 100644
--- a/conda/recipes/libraft/build_libraft_tests.sh
+++ b/conda/recipes/libraft/build_libraft_tests.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
 # Copyright (c) 2022-2023, NVIDIA CORPORATION.
 
-./build.sh tests bench-prims --allgpuarch --no-nvtx
+./build.sh tests bench-prims --allgpuarch --no-nvtx --build-metrics=tests_bench_prims --incl-cache-stats
 cmake --install cpp/build --component testing

From 70977202f9b0571ec557233905c1df6ab7b391d1 Mon Sep 17 00:00:00 2001
From: Micka <mide@nvidia.com>
Date: Fri, 14 Apr 2023 01:16:56 +0200
Subject: [PATCH 47/89] Fix IVF-PQ API to use `device_vector_view` (#1384)

This PR mainly replaces `device_matrix_view` with `device_vector_view` for the `new_indices` argument of `ivf_pq::extend`.
There are also a few updates to the documentation to reflect the current API.
The order of the arguments in the API is not touched.

Authors:
  - Micka (https://github.com/lowener)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Victor Lafargue (https://github.com/viclafargue)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/1384
---
 cpp/include/raft_runtime/neighbors/ivf_pq.hpp | 40 ++++++-------
 cpp/src/raft_runtime/neighbors/ivfpq_build.cu | 58 +++++++++----------
 cpp/test/neighbors/ann_ivf_pq.cuh             |  5 +-
 .../pylibraft/neighbors/ivf_flat/ivf_flat.pyx |  2 +-
 .../neighbors/ivf_pq/cpp/c_ivf_pq.pxd         | 12 ++--
 .../pylibraft/neighbors/ivf_pq/ivf_pq.pyx     | 22 +++++--
 6 files changed, 77 insertions(+), 62 deletions(-)

diff --git a/cpp/include/raft_runtime/neighbors/ivf_pq.hpp b/cpp/include/raft_runtime/neighbors/ivf_pq.hpp
index fb22d7657e..17260b0ded 100644
--- a/cpp/include/raft_runtime/neighbors/ivf_pq.hpp
+++ b/cpp/include/raft_runtime/neighbors/ivf_pq.hpp
@@ -23,26 +23,26 @@ namespace raft::runtime::neighbors::ivf_pq {
 // We define overloads for build and extend with void return type. This is used in the Cython
 // wrappers, where exception handling is not compatible with return type that has nontrivial
 // constructor.
-#define RAFT_DECL_BUILD_EXTEND(T, IdxT)                                                         \
-  [[nodiscard]] raft::neighbors::ivf_pq::index<IdxT> build(                                     \
-    raft::device_resources const& handle,                                                       \
-    const raft::neighbors::ivf_pq::index_params& params,                                        \
-    raft::device_matrix_view<const T, IdxT, row_major> dataset);                                \
-                                                                                                \
-  void build(raft::device_resources const& handle,                                              \
-             const raft::neighbors::ivf_pq::index_params& params,                               \
-             raft::device_matrix_view<const T, IdxT, row_major> dataset,                        \
-             raft::neighbors::ivf_pq::index<IdxT>* idx);                                        \
-                                                                                                \
-  [[nodiscard]] raft::neighbors::ivf_pq::index<IdxT> extend(                                    \
-    raft::device_resources const& handle,                                                       \
-    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                             \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,           \
-    const raft::neighbors::ivf_pq::index<IdxT>& idx);                                           \
-                                                                                                \
-  void extend(raft::device_resources const& handle,                                             \
-              raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                   \
-              std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices, \
+#define RAFT_DECL_BUILD_EXTEND(T, IdxT)                                              \
+  [[nodiscard]] raft::neighbors::ivf_pq::index<IdxT> build(                          \
+    raft::device_resources const& handle,                                            \
+    const raft::neighbors::ivf_pq::index_params& params,                             \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset);                     \
+                                                                                     \
+  void build(raft::device_resources const& handle,                                   \
+             const raft::neighbors::ivf_pq::index_params& params,                    \
+             raft::device_matrix_view<const T, IdxT, row_major> dataset,             \
+             raft::neighbors::ivf_pq::index<IdxT>* idx);                             \
+                                                                                     \
+  [[nodiscard]] raft::neighbors::ivf_pq::index<IdxT> extend(                         \
+    raft::device_resources const& handle,                                            \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                  \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices,           \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx);                                \
+                                                                                     \
+  void extend(raft::device_resources const& handle,                                  \
+              raft::device_matrix_view<const T, IdxT, row_major> new_vectors,        \
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices, \
               raft::neighbors::ivf_pq::index<IdxT>* idx);
 
 RAFT_DECL_BUILD_EXTEND(float, int64_t);
diff --git a/cpp/src/raft_runtime/neighbors/ivfpq_build.cu b/cpp/src/raft_runtime/neighbors/ivfpq_build.cu
index f6f8a541c0..5bfb546060 100644
--- a/cpp/src/raft_runtime/neighbors/ivfpq_build.cu
+++ b/cpp/src/raft_runtime/neighbors/ivfpq_build.cu
@@ -19,35 +19,35 @@
 
 namespace raft::runtime::neighbors::ivf_pq {
 
-#define RAFT_INST_BUILD_EXTEND(T, IdxT)                                                         \
-  raft::neighbors::ivf_pq::index<IdxT> build(                                                   \
-    raft::device_resources const& handle,                                                       \
-    const raft::neighbors::ivf_pq::index_params& params,                                        \
-    raft::device_matrix_view<const T, IdxT, row_major> dataset)                                 \
-  {                                                                                             \
-    return raft::neighbors::ivf_pq::build<T, IdxT>(handle, params, dataset);                    \
-  }                                                                                             \
-  void build(raft::device_resources const& handle,                                              \
-             const raft::neighbors::ivf_pq::index_params& params,                               \
-             raft::device_matrix_view<const T, IdxT, row_major> dataset,                        \
-             raft::neighbors::ivf_pq::index<IdxT>* idx)                                         \
-  {                                                                                             \
-    *idx = raft::neighbors::ivf_pq::build<T, IdxT>(handle, params, dataset);                    \
-  }                                                                                             \
-  raft::neighbors::ivf_pq::index<IdxT> extend(                                                  \
-    raft::device_resources const& handle,                                                       \
-    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                             \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,           \
-    const raft::neighbors::ivf_pq::index<IdxT>& idx)                                            \
-  {                                                                                             \
-    return raft::neighbors::ivf_pq::extend<T, IdxT>(handle, new_vectors, new_indices, idx);     \
-  }                                                                                             \
-  void extend(raft::device_resources const& handle,                                             \
-              raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                   \
-              std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices, \
-              raft::neighbors::ivf_pq::index<IdxT>* idx)                                        \
-  {                                                                                             \
-    raft::neighbors::ivf_pq::extend<T, IdxT>(handle, new_vectors, new_indices, idx);            \
+#define RAFT_INST_BUILD_EXTEND(T, IdxT)                                                     \
+  raft::neighbors::ivf_pq::index<IdxT> build(                                               \
+    raft::device_resources const& handle,                                                   \
+    const raft::neighbors::ivf_pq::index_params& params,                                    \
+    raft::device_matrix_view<const T, IdxT, row_major> dataset)                             \
+  {                                                                                         \
+    return raft::neighbors::ivf_pq::build<T, IdxT>(handle, params, dataset);                \
+  }                                                                                         \
+  void build(raft::device_resources const& handle,                                          \
+             const raft::neighbors::ivf_pq::index_params& params,                           \
+             raft::device_matrix_view<const T, IdxT, row_major> dataset,                    \
+             raft::neighbors::ivf_pq::index<IdxT>* idx)                                     \
+  {                                                                                         \
+    *idx = raft::neighbors::ivf_pq::build<T, IdxT>(handle, params, dataset);                \
+  }                                                                                         \
+  raft::neighbors::ivf_pq::index<IdxT> extend(                                              \
+    raft::device_resources const& handle,                                                   \
+    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                         \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices,                  \
+    const raft::neighbors::ivf_pq::index<IdxT>& idx)                                        \
+  {                                                                                         \
+    return raft::neighbors::ivf_pq::extend<T, IdxT>(handle, new_vectors, new_indices, idx); \
+  }                                                                                         \
+  void extend(raft::device_resources const& handle,                                         \
+              raft::device_matrix_view<const T, IdxT, row_major> new_vectors,               \
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices,        \
+              raft::neighbors::ivf_pq::index<IdxT>* idx)                                    \
+  {                                                                                         \
+    raft::neighbors::ivf_pq::extend<T, IdxT>(handle, new_vectors, new_indices, idx);        \
   }
 
 RAFT_INST_BUILD_EXTEND(float, int64_t);
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 3f09f605c6..90387cde2f 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -206,13 +206,12 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     auto idx = ivf_pq::build<DataT, IdxT>(handle_, ipams, database_view);
 
     auto vecs_2_view = raft::make_device_matrix_view<DataT, IdxT>(vecs_2, size_2, ps.dim);
-    auto inds_2_view = raft::make_device_matrix_view<IdxT, IdxT>(inds_2, size_2, 1);
+    auto inds_2_view = raft::make_device_vector_view<IdxT, IdxT>(inds_2, size_2);
     ivf_pq::extend<DataT, IdxT>(handle_, vecs_2_view, inds_2_view, &idx);
 
     auto vecs_1_view =
       raft::make_device_matrix_view<DataT, IdxT, row_major>(vecs_1, size_1, ps.dim);
-    auto inds_1_view =
-      raft::make_device_matrix_view<const IdxT, IdxT, row_major>(inds_1, size_1, 1);
+    auto inds_1_view = raft::make_device_vector_view<const IdxT, IdxT>(inds_1, size_1);
     ivf_pq::extend<DataT, IdxT>(handle_, vecs_1_view, inds_1_view, &idx);
     return idx;
   }
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx b/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx
index db279ad2db..352376fe17 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx
+++ b/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx
@@ -427,7 +427,7 @@ def extend(Index index, new_vectors, new_indices, handle=None):
         Trained ivf_flat object.
     new_vectors : CUDA array interface compliant matrix shape (n_samples, dim)
         Supported dtype [float, int8, uint8]
-    new_indices : CUDA array interface compliant matrix shape (n_samples, dim)
+    new_indices : CUDA array interface compliant vector shape (n_samples)
         Supported dtype [int64]
     {handle_docstring}
 
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd
index d04d833f3b..531c2428e9 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd
+++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd
@@ -29,7 +29,11 @@ from libcpp.string cimport string
 
 from rmm._lib.memory_resource cimport device_memory_resource
 
-from pylibraft.common.cpp.mdspan cimport device_matrix_view, row_major
+from pylibraft.common.cpp.mdspan cimport (
+    device_matrix_view,
+    device_vector_view,
+    row_major,
+)
 from pylibraft.common.handle cimport device_resources
 from pylibraft.common.optional cimport optional
 from pylibraft.distance.distance_type cimport DistanceType
@@ -126,19 +130,19 @@ cdef extern from "raft_runtime/neighbors/ivf_pq.hpp" \
     cdef void extend(
         const device_resources& handle,
         device_matrix_view[float, int64_t, row_major] new_vectors,
-        optional[device_matrix_view[int64_t, int64_t, row_major]] new_indices,
+        optional[device_vector_view[int64_t, int64_t]] new_indices,
         index[int64_t]* index) except +
 
     cdef void extend(
         const device_resources& handle,
         device_matrix_view[int8_t, int64_t, row_major] new_vectors,
-        optional[device_matrix_view[int64_t, int64_t, row_major]] new_indices,
+        optional[device_vector_view[int64_t, int64_t]] new_indices,
         index[int64_t]* index) except +
 
     cdef void extend(
         const device_resources& handle,
         device_matrix_view[uint8_t, int64_t, row_major] new_vectors,
-        optional[device_matrix_view[int64_t, int64_t, row_major]] new_indices,
+        optional[device_vector_view[int64_t, int64_t]] new_indices,
         index[int64_t]* index) except +
 
     cdef void search(
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
index 1906c569f6..b89e5dd44d 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
+++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
@@ -51,10 +51,16 @@ from rmm._lib.memory_resource cimport (
 
 cimport pylibraft.neighbors.ivf_flat.cpp.c_ivf_flat as c_ivf_flat
 cimport pylibraft.neighbors.ivf_pq.cpp.c_ivf_pq as c_ivf_pq
+from pylibraft.common.optional cimport make_optional, optional
 
 from pylibraft.neighbors.common import _check_input_array, _get_metric
 
-from pylibraft.common.cpp.mdspan cimport device_matrix_view, row_major
+from pylibraft.common.cpp.mdspan cimport (
+    device_matrix_view,
+    device_vector_view,
+    make_device_vector_view,
+    row_major,
+)
 from pylibraft.common.mdspan cimport (
     get_dmv_float,
     get_dmv_int8,
@@ -416,7 +422,7 @@ def extend(Index index, new_vectors, new_indices, handle=None):
         Trained ivf_pq object.
     new_vectors : array interface compliant matrix shape (n_samples, dim)
         Supported dtype [float, int8, uint8]
-    new_indices : array interface compliant matrix shape (n_samples, dim)
+    new_indices : array interface compliant vector shape (n_samples)
         Supported dtype [int64]
     {handle_docstring}
 
@@ -472,6 +478,7 @@ def extend(Index index, new_vectors, new_indices, handle=None):
 
     vecs_cai = wrap_array(new_vectors)
     vecs_dt = vecs_cai.dtype
+    cdef optional[device_vector_view[int64_t, int64_t]] new_indices_opt
     cdef int64_t n_rows = vecs_cai.shape[0]
     cdef uint32_t dim = vecs_cai.shape[1]
 
@@ -484,23 +491,28 @@ def extend(Index index, new_vectors, new_indices, handle=None):
     if len(idx_cai.shape)!=1:
         raise ValueError("Indices array is expected to be 1D")
 
+    if index.index.size() > 0:
+        new_indices_opt = make_device_vector_view(
+            <int64_t *><uintptr_t>idx_cai.data,
+            <int64_t>idx_cai.shape[0])
+
     if vecs_dt == np.float32:
         with cuda_interruptible():
             c_ivf_pq.extend(deref(handle_),
                             get_dmv_float(vecs_cai, check_shape=True),
-                            make_optional_view_int64(get_dmv_int64(idx_cai, check_shape=False)),  # noqa: E501
+                            new_indices_opt,
                             index.index)
     elif vecs_dt == np.int8:
         with cuda_interruptible():
             c_ivf_pq.extend(deref(handle_),
                             get_dmv_int8(vecs_cai, check_shape=True),
-                            make_optional_view_int64(get_dmv_int64(idx_cai, check_shape=False)),  # noqa: E501
+                            new_indices_opt,
                             index.index)
     elif vecs_dt == np.uint8:
         with cuda_interruptible():
             c_ivf_pq.extend(deref(handle_),
                             get_dmv_uint8(vecs_cai, check_shape=True),
-                            make_optional_view_int64(get_dmv_int64(idx_cai, check_shape=False)),  # noqa: E501
+                            new_indices_opt,
                             index.index)
     else:
         raise TypeError("query dtype %s not supported" % vecs_dt)

From a7a46ca46129993683423007d4d2f8fa8e596acb Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 13 Apr 2023 19:44:31 -0400
Subject: [PATCH 48/89] Adding base header-only conda package without cuda math
 libs (#1386)

cc @MatthiasKohl  @bdice

Making sure CI agrees w/ this change. @MatthiasKohl, if CI succeeds here let's try to plug the resulting conda packages into a cugraph-ops PR to make sure cugraph-ops CI is happy as well.

Authors:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/raft/pull/1386
---
 build.sh                                      |  2 +-
 conda/recipes/libraft/meta.yaml               | 51 +++++++-----------
 cpp/CMakeLists.txt                            | 53 ++++++++++++++-----
 cpp/bench/ann/CMakeLists.txt                  |  1 +
 .../raft/linalg/detail/cublas_wrappers.hpp    |  8 +--
 .../detail/modularity_maximization.hpp        | 18 -------
 cpp/test/CMakeLists.txt                       |  3 +-
 docs/source/build.md                          | 24 +++++----
 8 files changed, 82 insertions(+), 78 deletions(-)

diff --git a/build.sh b/build.sh
index 039f0ed6a5..ab904abdad 100755
--- a/build.sh
+++ b/build.sh
@@ -522,7 +522,7 @@ fi
 # Initiate build for example RAFT application template (if needed)
 
 if hasArg template; then
-    pushd cpp/template
+    pushd ${REPODIR}/cpp/template
     ./build.sh
     popd
 fi
diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml
index ccd7314484..8ec9cc10c6 100644
--- a/conda/recipes/libraft/meta.yaml
+++ b/conda/recipes/libraft/meta.yaml
@@ -16,7 +16,7 @@ source:
   git_url: ../../..
 
 outputs:
-  - name: libraft-headers
+  - name: libraft-headers-only
     version: {{ version }}
     script: build_libraft_headers.sh
     build:
@@ -50,20 +50,26 @@ outputs:
         - ninja
         - sysroot_{{ target_platform }} {{ sysroot_version }}
       host:
-        - cuda-profiler-api {{ cuda_profiler_api_host_version }}
-        - cudatoolkit ={{ cuda_version }}
-        - libcublas {{ libcublas_host_version }}
-        - libcublas-dev {{ libcublas_host_version }}
-        - libcurand {{ libcurand_host_version }}
-        - libcurand-dev {{ libcurand_host_version }}
-        - libcusolver {{ libcusolver_host_version }}
-        - libcusolver-dev {{ libcusolver_host_version }}
-        - libcusparse {{ libcusparse_host_version }}
-        - libcusparse-dev {{ libcusparse_host_version }}
         - librmm ={{ minor_version }}
+        - cudatoolkit {{ cuda_version }}
+    about:
+      home: https://rapids.ai/
+      license: Apache-2.0
+      summary: libraft-headers-only library
+  - name: libraft-headers
+    version: {{ version }}
+    build:
+      number: {{ GIT_DESCRIBE_NUMBER }}
+      string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
+      ignore_run_exports_from:
+        - {{ compiler('cuda') }}
+        - librmm
+    requirements:
       run:
-        - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
+        - {{ pin_subpackage('libraft-headers-only', exact=True) }}
         - cuda-profiler-api {{ cuda_profiler_api_run_version }}
+        - cudatoolkit {{ cuda_version }}
+        - librmm ={{ minor_version }}
         - libcublas {{ libcublas_run_version }}
         - libcublas-dev {{ libcublas_run_version }}
         - libcurand {{ libcurand_run_version }}
@@ -72,7 +78,6 @@ outputs:
         - libcusolver-dev {{ libcusolver_run_version }}
         - libcusparse {{ libcusparse_run_version }}
         - libcusparse-dev {{ libcusparse_run_version }}
-        - librmm ={{ minor_version }}
     about:
       home: https://rapids.ai/
       license: Apache-2.0
@@ -130,7 +135,6 @@ outputs:
         - sysroot_{{ target_platform }} {{ sysroot_version }}
       host:
         - {{ pin_subpackage('libraft', exact=True) }}
-        - {{ pin_subpackage('libraft-headers', exact=True) }}
         - cuda-profiler-api {{ cuda_profiler_api_host_version }}
         - gmock {{ gtest_version }}
         - gtest {{ gtest_version }}
@@ -144,7 +148,6 @@ outputs:
         - libcusparse-dev {{ libcusparse_host_version }}
       run:
         - {{ pin_subpackage('libraft', exact=True) }}
-        - {{ pin_subpackage('libraft-headers', exact=True) }}
         - gmock {{ gtest_version }}
         - gtest {{ gtest_version }}
     about:
@@ -170,19 +173,10 @@ outputs:
         - sysroot_{{ target_platform }} {{ sysroot_version }}
       host:
         - {{ pin_subpackage('libraft', exact=True) }}
-        - {{ pin_subpackage('libraft-headers', exact=True) }}
-        - cuda-profiler-api {{ cuda_profiler_api_host_version }}
         - libcublas {{ libcublas_host_version }}
         - libcublas-dev {{ libcublas_host_version }}
-        - libcurand {{ libcurand_host_version }}
-        - libcurand-dev {{ libcurand_host_version }}
-        - libcusolver {{ libcusolver_host_version }}
-        - libcusolver-dev {{ libcusolver_host_version }}
-        - libcusparse {{ libcusparse_host_version }}
-        - libcusparse-dev {{ libcusparse_host_version }}
       run:
         - {{ pin_subpackage('libraft', exact=True) }}
-        - {{ pin_subpackage('libraft-headers', exact=True) }}
     about:
       home: https://rapids.ai/
       license: Apache-2.0
@@ -206,23 +200,14 @@ outputs:
         - sysroot_{{ target_platform }} {{ sysroot_version }}
       host:
         - {{ pin_subpackage('libraft', exact=True) }}
-        - {{ pin_subpackage('libraft-headers', exact=True) }}
-        - cuda-profiler-api {{ cuda_profiler_api_host_version }}
         - libcublas {{ libcublas_host_version }}
         - libcublas-dev {{ libcublas_host_version }}
-        - libcurand {{ libcurand_host_version }}
-        - libcurand-dev {{ libcurand_host_version }}
-        - libcusolver {{ libcusolver_host_version }}
-        - libcusolver-dev {{ libcusolver_host_version }}
-        - libcusparse {{ libcusparse_host_version }}
-        - libcusparse-dev {{ libcusparse_host_version }}
         - glog {{ glog_version }}
         - nlohmann_json {{ nlohmann_json_version }}
         - libfaiss>=1.7.1
         - faiss-proc=*=cuda
       run:
         - {{ pin_subpackage('libraft', exact=True) }}
-        - {{ pin_subpackage('libraft-headers', exact=True) }}
         - glog {{ glog_version }}
         - faiss-proc=*=cuda
         - libfaiss {{ faiss_version }}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 1824ebb241..701b056771 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -179,17 +179,7 @@ target_include_directories(
 )
 
 # Keep RAFT as lightweight as possible. Only CUDA libs and rmm should be used in global target.
-target_link_libraries(
-  raft
-  INTERFACE rmm::rmm
-            cuco::cuco
-            nvidia::cutlass::cutlass
-            CUDA::cublas${_ctk_static_suffix}
-            CUDA::curand${_ctk_static_suffix}
-            CUDA::cusolver${_ctk_static_suffix}
-            CUDA::cusparse${_ctk_static_suffix}
-            raft::Thrust
-)
+target_link_libraries(raft INTERFACE rmm::rmm cuco::cuco nvidia::cutlass::cutlass raft::Thrust)
 
 target_compile_features(raft INTERFACE cxx_std_17 $<BUILD_INTERFACE:cuda_std_17>)
 target_compile_options(
@@ -197,6 +187,15 @@ target_compile_options(
                  --expt-relaxed-constexpr>
 )
 
+set(RAFT_CUSOLVER_DEPENDENCY CUDA::cusolver${_ctk_static_suffix})
+set(RAFT_CUBLAS_DEPENDENCY CUDA::cublas${_ctk_static_suffix})
+set(RAFT_CURAND_DEPENDENCY CUDA::curand${_ctk_static_suffix})
+set(RAFT_CUSPARSE_DEPENDENCY CUDA::cusparse${_ctk_static_suffix})
+
+set(RAFT_CTK_MATH_DEPENDENCIES ${RAFT_CUBLAS_DEPENDENCY} ${RAFT_CUSOLVER_DEPENDENCY}
+                               ${RAFT_CUSPARSE_DEPENDENCY} ${RAFT_CURAND_DEPENDENCY}
+)
+
 # Endian detection
 include(TestBigEndian)
 test_big_endian(BIG_ENDIAN)
@@ -402,7 +401,13 @@ if(RAFT_COMPILE_LIBRARY)
                INTERFACE_POSITION_INDEPENDENT_CODE ON
   )
 
-  target_link_libraries(raft_lib PUBLIC raft::raft $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>)
+  target_link_libraries(
+    raft_lib
+    PUBLIC raft::raft
+           ${RAFT_CTK_MATH_DEPENDENCIES} # TODO: Once `raft::resources` is used everywhere, this
+                                         # will just be cublas
+           $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+  )
   target_compile_options(
     raft_lib PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAFT_CXX_FLAGS}>"
                      "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
@@ -555,6 +560,30 @@ if(TARGET raft_lib)
   list(APPEND raft_export_sets raft-compiled-lib-exports)
 endif()
 
+string(
+  APPEND
+  code_string
+  [=[
+ option(RAFT_ENABLE_CUSOLVER_DEPENDENCY "Enable cusolver dependency" ON)
+ option(RAFT_ENABLE_CUBLAS_DEPENDENCY "Enable cublas dependency" ON)
+ option(RAFT_ENABLE_CURAND_DEPENDENCY "Enable curand dependency" ON)
+ option(RAFT_ENABLE_CUSPARSE_DEPENDENCY "Enable cusparse dependency" ON)
+
+mark_as_advanced(RAFT_ENABLE_CUSOLVER_DEPENDENCY)
+mark_as_advanced(RAFT_ENABLE_CUBLAS_DEPENDENCY)
+mark_as_advanced(RAFT_ENABLE_CURAND_DEPENDENCY)
+mark_as_advanced(RAFT_ENABLE_CUSPARSE_DEPENDENCY)
+
+target_link_libraries(raft::raft INTERFACE
+  $<$<BOOL:${RAFT_ENABLE_CUSOLVER_DEPENDENCY}>:${RAFT_CUSOLVER_DEPENDENCY}>
+  $<$<BOOL:${RAFT_ENABLE_CUBLAS_DEPENDENCY}>:${RAFT_CUBLAS_DEPENDENCY}>
+  $<$<BOOL:${RAFT_ENABLE_CUSPARSE_DEPENDENCY}>:${RAFT_CUSPARSE_DEPENDENCY}>
+  $<$<BOOL:${RAFT_ENABLE_CURAND_DEPENDENCY}>:${RAFT_CURAND_DEPENDENCY}>
+)
+]=]
+)
+
+# Use `rapids_export` for 22.04 as it will have COMPONENT support
 rapids_export(
   INSTALL raft
   EXPORT_SET raft-exports
diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index 6267be518e..a14018a15d 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -80,6 +80,7 @@ function(ConfigureAnnBench)
             $<$<BOOL:${RAFT_ANN_BENCH_USE_MULTIGPU}>:NCCL::NCCL>
             ${ConfigureAnnBench_LINKS}
             Threads::Threads
+            ${RAFT_CTK_MATH_DEPENDENCIES}
             $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
             $<TARGET_NAME_IF_EXISTS:conda_env>
   )
diff --git a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
index 03975b1b7d..87a195757c 100644
--- a/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
+++ b/cpp/include/raft/linalg/detail/cublas_wrappers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -965,7 +965,8 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle,
                                 cudaStream_t stream)
 {
   RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
-  return cublasSdot(handle, n, x, incx, y, incy, result);
+  return cublasDotEx(
+    handle, n, x, CUDA_R_32F, incx, y, CUDA_R_32F, incy, result, CUDA_R_32F, CUDA_R_32F);
 }
 
 template <>
@@ -979,7 +980,8 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle,
                                 cudaStream_t stream)
 {
   RAFT_CUBLAS_TRY(cublasSetStream(handle, stream));
-  return cublasDdot(handle, n, x, incx, y, incy, result);
+  return cublasDotEx(
+    handle, n, x, CUDA_R_64F, incx, y, CUDA_R_64F, incy, result, CUDA_R_64F, CUDA_R_64F);
 }
 /** @} */
 
diff --git a/cpp/include/raft/spectral/detail/modularity_maximization.hpp b/cpp/include/raft/spectral/detail/modularity_maximization.hpp
index 160664bae8..d81c64b257 100644
--- a/cpp/include/raft/spectral/detail/modularity_maximization.hpp
+++ b/cpp/include/raft/spectral/detail/modularity_maximization.hpp
@@ -32,24 +32,6 @@
 #include <raft/spectral/eigen_solvers.cuh>
 #include <raft/spectral/matrix_wrappers.hpp>
 
-#ifdef COLLECT_TIME_STATISTICS
-#include <cuda_profiler_api.h>
-#include <stddef.h>
-#include <sys/resource.h>
-#include <sys/sysinfo.h>
-#include <sys/time.h>
-#endif
-
-#ifdef COLLECT_TIME_STATISTICS
-static double timer(void)
-{
-  struct timeval tv;
-  cudaDeviceSynchronize();
-  gettimeofday(&tv, NULL);
-  return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
-}
-#endif
-
 namespace raft {
 namespace spectral {
 namespace detail {
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index c7292361b7..175411f983 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -31,12 +31,13 @@ function(ConfigureTest)
 
   target_link_libraries(
     ${TEST_NAME}
-    PRIVATE raft::raft
+    PRIVATE raft
             raft_internal
             $<$<BOOL:${ConfigureTest_LIB}>:raft::compiled>
             GTest::gtest
             GTest::gtest_main
             Threads::Threads
+            ${RAFT_CTK_MATH_DEPENDENCIES}
             $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
             $<TARGET_NAME_IF_EXISTS:conda_env>
   )
diff --git a/docs/source/build.md b/docs/source/build.md
index dfc76fda66..021286847d 100644
--- a/docs/source/build.md
+++ b/docs/source/build.md
@@ -151,18 +151,22 @@ make -j<parallel_level> install
 
 RAFT's cmake has the following configurable flags available:.
 
-| Flag                      | Possible Values      | Default Value | Behavior |
-|---------------------------|----------------------| --- | --- |
-| BUILD_TESTS               | ON, OFF              | ON | Compile Googletests |
-| BUILD_PRIMS_BENCH               | ON, OFF              | OFF | Compile benchmarks |
+| Flag                            | Possible Values      | Default Value | Behavior                                                                     |
+|---------------------------------|----------------------| --- |------------------------------------------------------------------------------|
+| BUILD_TESTS                     | ON, OFF              | ON | Compile Googletests                                                          |
+| BUILD_PRIMS_BENCH                     | ON, OFF              | OFF | Compile benchmarks                                                           |
 | BUILD_ANN_BENCH               | ON, OFF              | OFF | Compile end-to-end ANN benchmarks |
-| raft_FIND_COMPONENTS      | compiled distributed | | Configures the optional components as a space-separated list |
 | RAFT_COMPILE_LIBRARY      | ON, OFF              | ON if either BUILD_TESTS or BUILD_PRIMS_BENCH is ON; otherwise OFF | Compiles all `libraft` shared libraries (these are required for Googletests) |
-| DETECT_CONDA_ENV          | ON, OFF              | ON | Enable detection of conda environment for dependencies |
-| RAFT_NVTX                 | ON, OFF              | OFF | Enable NVTX Markers |
-| CUDA_ENABLE_KERNELINFO    | ON, OFF              | OFF | Enables `kernelinfo` in nvcc. This is useful for `compute-sanitizer` |
-| CUDA_ENABLE_LINEINFO      | ON, OFF              | OFF | Enable the -lineinfo option for nvcc |
-| CUDA_STATIC_RUNTIME       | ON, OFF              | OFF | Statically link the CUDA runtime |
+| raft_FIND_COMPONENTS            | compiled distributed | | Configures the optional components as a space-separated list                 |
+| RAFT_ENABLE_CUBLAS_DEPENDENCY   | ON, OFF | ON | Link against cublas library in `raft::raft`                                  | 
+| RAFT_ENABLE_CUSOLVER_DEPENDENCY | ON, OFF | ON | Link against cusolver library in `raft::raft`                                | 
+| RAFT_ENABLE_CUSPARSE_DEPENDENCY | ON, OFF | ON | Link against cusparse library in `raft::raft`                                | 
+| RAFT_ENABLE_CURAND_DEPENDENCY   | ON, OFF | ON | Link against curand library in `raft::raft`                                  | 
+| DETECT_CONDA_ENV                | ON, OFF              | ON | Enable detection of conda environment for dependencies                       |
+| RAFT_NVTX                       | ON, OFF              | OFF | Enable NVTX Markers                                                          |
+| CUDA_ENABLE_KERNELINFO          | ON, OFF              | OFF | Enables `kernelinfo` in nvcc. This is useful for `compute-sanitizer`         |
+| CUDA_ENABLE_LINEINFO            | ON, OFF              | OFF | Enable the -lineinfo option for nvcc                                         |
+| CUDA_STATIC_RUNTIME             | ON, OFF              | OFF | Statically link the CUDA runtime                                             |
 
 Currently, shared libraries are provided for the `libraft-nn` and `libraft-distance` components.
 

From 38d276ebc8ff300ce2f498a410762beb4110526d Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 15:56:32 +0200
Subject: [PATCH 49/89] Fix style

---
 python/pylibraft/pyproject.toml | 2 --
 python/raft-dask/pyproject.toml | 2 --
 2 files changed, 4 deletions(-)

diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index b4eb296089..cf8e722b86 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -48,8 +48,6 @@ classifiers = [
 [project.optional-dependencies]
 test = [
     "cupy",
-    "pytest",
-    "pytest-cov",
     "scikit-learn",
     "scipy",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 4901df6c38..210a8ffaa8 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -54,8 +54,6 @@ classifiers = [
 
 [project.optional-dependencies]
 test = [
-    "pytest",
-    "pytest-cov",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]

From b9ba602c312bc52052dd725439ed02028cc30306 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 16:05:21 +0200
Subject: [PATCH 50/89] Fix style

---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 1 +
 cpp/cmake/modules/ConfigureCUDA.cmake            | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 0f1999c93c..6bb981c1d0 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -53,5 +53,6 @@ dependencies:
 - sysroot_linux-64==2.17
 - ucx-proc=*=gpu
 - ucx-py=0.32.*
+- ucx-py==0.32.*
 - ucx>=1.13.0
 name: all_cuda-118_arch-x86_64
diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake
index d88d48a5cf..8b2b5c54d0 100644
--- a/cpp/cmake/modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/modules/ConfigureCUDA.cmake
@@ -37,7 +37,6 @@ if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0)
   # list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings)
 endif()
 
-
 # Option to enable line info in CUDA device compilation to allow introspection when profiling /
 # memchecking
 if(CUDA_ENABLE_LINEINFO)

From 7ca22423fcb137c3485b826d8147e0c32c5ebf1a Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 16:20:12 +0200
Subject: [PATCH 51/89] Remove greppable-id comments

They are now redundant as a compiler error will be raised on accidental
template instantiation
---
 .../neighbors/detail/ivf_flat_interleaved_scan-inl.cuh   | 8 --------
 cpp/include/raft/neighbors/detail/refine.cuh             | 9 ---------
 2 files changed, 17 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh
index 01bfbf4a43..4eed2aa453 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh
@@ -1056,14 +1056,6 @@ void ivfflat_interleaved_scan(const index<T, IdxT>& index,
                               uint32_t& grid_dim_x,
                               rmm::cuda_stream_view stream)
 {
-  // greppable-id-specializations-ivf-flat-search: The ivfflat_interleaved_scan
-  // function is used in both raft::neighbors::ivf_flat::search and
-  // raft::neighbors::detail::refine_device. To prevent a duplicate
-  // instantiation of this function (which defines ~270 kernels) in the refine
-  // specializations, an extern template definition is provided. Please check
-  // related function calls after editing this function definition. Search for
-  // `greppable-id-specializations-ivf-flat-search` to find them.
-
   const int capacity = bound_by_power_of_two(k);
   select_interleaved_scan_kernel<T, AccT, IdxT>::run(capacity,
                                                      index.veclen(),
diff --git a/cpp/include/raft/neighbors/detail/refine.cuh b/cpp/include/raft/neighbors/detail/refine.cuh
index 5fa0e1ab15..0ff5e4cdbc 100644
--- a/cpp/include/raft/neighbors/detail/refine.cuh
+++ b/cpp/include/raft/neighbors/detail/refine.cuh
@@ -118,15 +118,6 @@ void refine_device(raft::device_resources const& handle,
                                                            neighbor_candidates.data_handle(),
                                                            n_queries,
                                                            n_candidates);
-
-  // greppable-id-specializations-ivf-flat-search: The ivfflat_interleaved_scan
-  // function is used in both raft::neighbors::ivf_flat::search and
-  // raft::neighbors::detail::refine_device. To prevent a duplicate
-  // instantiation of this function (which defines ~270 kernels) in the refine
-  // specializations, an extern template definition is provided. Please check
-  // and adjust the extern template definition and the instantiation when the
-  // below function call is edited. Search for
-  // `greppable-id-specializations-ivf-flat-search` to find them.
   uint32_t grid_dim_x = 1;
   raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan<
     data_t,

From 00db48f496acf080f5b7749dfaa3b13de8a14122 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 16:32:56 +0200
Subject: [PATCH 52/89] Add note to modify generating python script

Instead of the src file itself.
---
 .../pairwise_matrix/dispatch_00_generate.py   | 24 ++++++++++++++++++-
 ...patch_canberra_double_double_double_int.cu |  9 +++++++
 ...dispatch_canberra_float_float_float_int.cu |  9 +++++++
 ...ch_correlation_double_double_double_int.cu |  9 +++++++
 ...patch_correlation_float_float_float_int.cu |  9 +++++++
 ...ispatch_cosine_double_double_double_int.cu |  9 +++++++
 .../dispatch_cosine_float_float_float_int.cu  |  9 +++++++
 ...ing_unexpanded_double_double_double_int.cu |  9 +++++++
 ...amming_unexpanded_float_float_float_int.cu |  9 +++++++
 ...inger_expanded_double_double_double_int.cu |  9 +++++++
 ...ellinger_expanded_float_float_float_int.cu |  9 +++++++
 ...jensen_shannon_double_double_double_int.cu |  9 +++++++
 ...ch_jensen_shannon_float_float_float_int.cu |  9 +++++++
 ..._kl_divergence_double_double_double_int.cu |  9 +++++++
 ...tch_kl_divergence_float_float_float_int.cu |  9 +++++++
 .../dispatch_l1_double_double_double_int.cu   |  9 +++++++
 .../dispatch_l1_float_float_float_int.cu      |  9 +++++++
 ...ch_l2_expanded_double_double_double_int.cu |  9 +++++++
 ...patch_l2_expanded_float_float_float_int.cu |  9 +++++++
 ..._l2_unexpanded_double_double_double_int.cu |  9 +++++++
 ...tch_l2_unexpanded_float_float_float_int.cu |  9 +++++++
 ...dispatch_l_inf_double_double_double_int.cu |  9 +++++++
 .../dispatch_l_inf_float_float_float_int.cu   |  9 +++++++
 ..._lp_unexpanded_double_double_double_int.cu |  9 +++++++
 ...tch_lp_unexpanded_float_float_float_int.cu |  9 +++++++
 ...tch_russel_rao_double_double_double_int.cu |  9 +++++++
 ...spatch_russel_rao_float_float_float_int.cu |  9 +++++++
 cpp/src/neighbors/brute_force_00_generate.py  | 23 +++++++++++++++++-
 .../brute_force_fused_l2_knn_float_int64_t.cu |  9 +++++++
 .../brute_force_knn_int64_t_float_int64_t.cu  |  9 +++++++
 .../brute_force_knn_int64_t_float_uint32_t.cu |  9 +++++++
 .../brute_force_knn_int_float_int.cu          |  9 +++++++
 ...brute_force_knn_uint32_t_float_uint32_t.cu |  9 +++++++
 .../ivf_pq_compute_similarity_00_generate.py  | 23 +++++++++++++++++-
 .../ivf_pq_compute_similarity_float_float.cu  |  9 +++++++
 ...f_pq_compute_similarity_float_fp8_false.cu |  9 +++++++
 ...vf_pq_compute_similarity_float_fp8_true.cu |  9 +++++++
 .../ivf_pq_compute_similarity_float_half.cu   |  9 +++++++
 ...vf_pq_compute_similarity_half_fp8_false.cu |  9 +++++++
 ...ivf_pq_compute_similarity_half_fp8_true.cu |  9 +++++++
 .../ivf_pq_compute_similarity_half_half.cu    |  9 +++++++
 cpp/src/neighbors/ivf_flat_00_generate.py     | 23 +++++++++++++++++-
 .../neighbors/ivf_flat_build_float_int64_t.cu |  9 +++++++
 .../ivf_flat_build_int8_t_int64_t.cu          |  9 +++++++
 .../ivf_flat_build_uint8_t_int64_t.cu         |  9 +++++++
 .../ivf_flat_extend_float_int64_t.cu          |  9 +++++++
 .../ivf_flat_extend_int8_t_int64_t.cu         |  9 +++++++
 .../ivf_flat_extend_uint8_t_int64_t.cu        |  9 +++++++
 .../ivf_flat_search_float_int64_t.cu          |  9 +++++++
 .../ivf_flat_search_int8_t_int64_t.cu         |  9 +++++++
 .../ivf_flat_search_uint8_t_int64_t.cu        |  9 +++++++
 cpp/src/neighbors/refine_00_generate.py       | 23 +++++++++++++++++-
 cpp/src/neighbors/refine_float_float.cu       |  9 +++++++
 cpp/src/neighbors/refine_int8_t_float.cu      |  9 +++++++
 cpp/src/neighbors/refine_uint8_t_float.cu     |  9 +++++++
 .../ball_cover/registers_00_generate.py       | 23 +++++++++++++++++-
 .../ball_cover/registers_pass_one_2d_dist.cu  |  9 +++++++
 .../registers_pass_one_2d_euclidean.cu        |  9 +++++++
 .../registers_pass_one_2d_haversine.cu        |  9 +++++++
 .../ball_cover/registers_pass_one_3d_dist.cu  |  9 +++++++
 .../registers_pass_one_3d_euclidean.cu        |  9 +++++++
 .../registers_pass_one_3d_haversine.cu        |  9 +++++++
 .../ball_cover/registers_pass_two_2d_dist.cu  |  9 +++++++
 .../registers_pass_two_2d_euclidean.cu        |  9 +++++++
 .../registers_pass_two_2d_haversine.cu        |  9 +++++++
 .../ball_cover/registers_pass_two_3d_dist.cu  |  9 +++++++
 .../registers_pass_two_3d_euclidean.cu        |  9 +++++++
 .../registers_pass_two_3d_haversine.cu        |  9 +++++++
 68 files changed, 691 insertions(+), 6 deletions(-)

diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
index 42ce3e2291..4537397aac 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
@@ -1,4 +1,16 @@
-#!/usr/bin/env python3
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 # NOTE: this template is not perfectly formatted. Use pre-commit to get
 # everything in shape again.
@@ -18,6 +30,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp> // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>  // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh> // dispatch
@@ -149,3 +170,4 @@ def arch_headers(archs):
             FinOpT = "raft::identity_op"
             f.write(f"\ninstantiate_raft_distance_detail_pairwise_matrix_dispatch({OpT}, {DataT}, {AccT}, {OutT}, {FinOpT}, {IdxT});\n")
             f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
+        print(f"src/distance/detail/pairwise_matrix/{path}")
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
index e937e319f0..41db12e9ae 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
index 87f6d3ba6b..f038e53381 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu
index 04d223edb3..52e4cc02d8 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_double_double_double_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu
index a5f66d448c..c9481d6c22 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_correlation_float_float_float_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu
index afd9dfdffc..517858125b 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu
index cd720b5363..62f1d9874b 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu
index 8b1ca0d6d5..500f7b4a9c 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu
index 4a4338ebd1..3be7586b43 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu
index e21a601ff6..023134ddff 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_double_double_double_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu
index e1c08abf46..e438f121f2 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_hellinger_expanded_float_float_float_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu
index c3a675eb0e..31c5003ad6 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_double_double_double_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu
index 6dcc056e2d..e78c1c320a 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_jensen_shannon_float_float_float_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu
index 4bb4ad1f11..5b95df9614 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_double_double_double_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu
index 44d6c6cace..fb72c91b73 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_kl_divergence_float_float_float_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu
index 3d257c5001..cac5acad92 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_double_double_double_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu
index 1a0b393a0a..78aa097961 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l1_float_float_float_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu
index 4059844964..c8d922f6fa 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_double_double_double_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu
index 474a29a149..20cf57f898 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_expanded_float_float_float_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu
index 99624b59b9..eadd0d2c2b 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_double_double_double_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu
index 5901e7c142..e4b5dd3a86 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l2_unexpanded_float_float_float_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu
index 22e1470bc3..45d021bce9 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_double_double_double_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu
index 28a66bb36a..ba48e52a18 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_l_inf_float_float_float_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu
index 3b36712161..ffa58793d9 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_double_double_double_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu
index be5f30a4d6..915c68f05f 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu
index 7e66efae9e..db45dc8b94 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
index bb3f493445..a2a5a9fafe 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
 #include <raft/core/operators.hpp>                                // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
 #include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
diff --git a/cpp/src/neighbors/brute_force_00_generate.py b/cpp/src/neighbors/brute_force_00_generate.py
index d955b3ea78..53a10b0a08 100644
--- a/cpp/src/neighbors/brute_force_00_generate.py
+++ b/cpp/src/neighbors/brute_force_00_generate.py
@@ -1,4 +1,16 @@
-#!/usr/bin/env python3
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 header = """
 /*
@@ -17,6 +29,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by brute_force_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python brute_force_00_generate.py
+ *
+ */
+
 #include <cstdint>
 #include <raft/neighbors/brute_force-inl.cuh>
 
diff --git a/cpp/src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu b/cpp/src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu
index 8e9ba3ea22..4b7eeb034c 100644
--- a/cpp/src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu
+++ b/cpp/src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by brute_force_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python brute_force_00_generate.py
+ *
+ */
+
 #include <cstdint>
 #include <raft/neighbors/brute_force-inl.cuh>
 
diff --git a/cpp/src/neighbors/brute_force_knn_int64_t_float_int64_t.cu b/cpp/src/neighbors/brute_force_knn_int64_t_float_int64_t.cu
index b2ed7c7732..cb2414d164 100644
--- a/cpp/src/neighbors/brute_force_knn_int64_t_float_int64_t.cu
+++ b/cpp/src/neighbors/brute_force_knn_int64_t_float_int64_t.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by brute_force_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python brute_force_00_generate.py
+ *
+ */
+
 #include <cstdint>
 #include <raft/neighbors/brute_force-inl.cuh>
 
diff --git a/cpp/src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu b/cpp/src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
index fa815cee64..e4b9c608f1 100644
--- a/cpp/src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
+++ b/cpp/src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by brute_force_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python brute_force_00_generate.py
+ *
+ */
+
 #include <cstdint>
 #include <raft/neighbors/brute_force-inl.cuh>
 
diff --git a/cpp/src/neighbors/brute_force_knn_int_float_int.cu b/cpp/src/neighbors/brute_force_knn_int_float_int.cu
index c3f51a2b1b..c9df0b3bbd 100644
--- a/cpp/src/neighbors/brute_force_knn_int_float_int.cu
+++ b/cpp/src/neighbors/brute_force_knn_int_float_int.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by brute_force_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python brute_force_00_generate.py
+ *
+ */
+
 #include <cstdint>
 #include <raft/neighbors/brute_force-inl.cuh>
 
diff --git a/cpp/src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu b/cpp/src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
index 88e1e82aab..17076857df 100644
--- a/cpp/src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
+++ b/cpp/src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by brute_force_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python brute_force_00_generate.py
+ *
+ */
+
 #include <cstdint>
 #include <raft/neighbors/brute_force-inl.cuh>
 
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py
index e3eead977c..a740d01bd2 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_00_generate.py
@@ -1,4 +1,16 @@
-#!/usr/bin/env python3
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 header = """
 /*
@@ -17,6 +29,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_pq_compute_similarity_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
 #include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
 
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
index ea31db16f4..956b7010d5 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_float.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_pq_compute_similarity_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
 #include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
 
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
index 02826f6f45..fba72ad1dd 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_pq_compute_similarity_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
 #include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
 
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
index 61558da70b..030f429315 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_pq_compute_similarity_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
 #include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
 
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
index c97ee5b20b..31a4d7d503 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_float_half.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_pq_compute_similarity_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
 #include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
 
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
index de4325b33b..c623c80446 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_pq_compute_similarity_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
 #include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
 
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
index baec4f1bea..f2aaca20db 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_pq_compute_similarity_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
 #include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
 
diff --git a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
index a73ed07ce4..4420b2534b 100644
--- a/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
+++ b/cpp/src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_pq_compute_similarity_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_pq_compute_similarity_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh>
 #include <raft/neighbors/detail/ivf_pq_fp_8bit.cuh>
 
diff --git a/cpp/src/neighbors/ivf_flat_00_generate.py b/cpp/src/neighbors/ivf_flat_00_generate.py
index a5bff90165..7ba04646d5 100644
--- a/cpp/src/neighbors/ivf_flat_00_generate.py
+++ b/cpp/src/neighbors/ivf_flat_00_generate.py
@@ -1,4 +1,16 @@
-#!/usr/bin/env python3
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 header = """/*
  * Copyright (c) 2023, NVIDIA CORPORATION.
@@ -16,6 +28,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_flat_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_flat_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/ivf_flat-inl.cuh>
 """
 
diff --git a/cpp/src/neighbors/ivf_flat_build_float_int64_t.cu b/cpp/src/neighbors/ivf_flat_build_float_int64_t.cu
index ad4912efa4..622f7c7d90 100644
--- a/cpp/src/neighbors/ivf_flat_build_float_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_build_float_int64_t.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_flat_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_flat_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/ivf_flat-inl.cuh>
 
 #define instantiate_raft_neighbors_ivf_flat_build(T, IdxT)      \
diff --git a/cpp/src/neighbors/ivf_flat_build_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_build_int8_t_int64_t.cu
index 3e5253048c..7b1eeae32d 100644
--- a/cpp/src/neighbors/ivf_flat_build_int8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_build_int8_t_int64_t.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_flat_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_flat_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/ivf_flat-inl.cuh>
 
 #define instantiate_raft_neighbors_ivf_flat_build(T, IdxT)      \
diff --git a/cpp/src/neighbors/ivf_flat_build_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_build_uint8_t_int64_t.cu
index 7ea9976645..40cf28151f 100644
--- a/cpp/src/neighbors/ivf_flat_build_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_build_uint8_t_int64_t.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_flat_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_flat_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/ivf_flat-inl.cuh>
 
 #define instantiate_raft_neighbors_ivf_flat_build(T, IdxT)      \
diff --git a/cpp/src/neighbors/ivf_flat_extend_float_int64_t.cu b/cpp/src/neighbors/ivf_flat_extend_float_int64_t.cu
index 48d77488a6..f7d99d7081 100644
--- a/cpp/src/neighbors/ivf_flat_extend_float_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_extend_float_int64_t.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_flat_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_flat_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/ivf_flat-inl.cuh>
 
 #define instantiate_raft_neighbors_ivf_flat_extend(T, IdxT)                \
diff --git a/cpp/src/neighbors/ivf_flat_extend_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_extend_int8_t_int64_t.cu
index 68fe1e3677..9eec4f9648 100644
--- a/cpp/src/neighbors/ivf_flat_extend_int8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_extend_int8_t_int64_t.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_flat_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_flat_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/ivf_flat-inl.cuh>
 
 #define instantiate_raft_neighbors_ivf_flat_extend(T, IdxT)                \
diff --git a/cpp/src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu
index aa371b96bc..fc24cbff74 100644
--- a/cpp/src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_extend_uint8_t_int64_t.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_flat_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_flat_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/ivf_flat-inl.cuh>
 
 #define instantiate_raft_neighbors_ivf_flat_extend(T, IdxT)                \
diff --git a/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu
index ab29d7f63a..60e0105fd1 100644
--- a/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_flat_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_flat_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/ivf_flat-inl.cuh>
 
 #define instantiate_raft_neighbors_ivf_flat_search(T, IdxT)     \
diff --git a/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu
index 00b8944d85..b317c3fc5b 100644
--- a/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_flat_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_flat_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/ivf_flat-inl.cuh>
 
 #define instantiate_raft_neighbors_ivf_flat_search(T, IdxT)     \
diff --git a/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
index b402626aa4..1545b4dbcf 100644
--- a/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by ivf_flat_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python ivf_flat_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/ivf_flat-inl.cuh>
 
 #define instantiate_raft_neighbors_ivf_flat_search(T, IdxT)     \
diff --git a/cpp/src/neighbors/refine_00_generate.py b/cpp/src/neighbors/refine_00_generate.py
index c3795a031e..18c8857e3f 100644
--- a/cpp/src/neighbors/refine_00_generate.py
+++ b/cpp/src/neighbors/refine_00_generate.py
@@ -1,4 +1,16 @@
-#!/usr/bin/env python3
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 header = """
 /*
@@ -17,6 +29,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by refine_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python refine_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/refine-inl.cuh>
 
 #define instantiate_raft_neighbors_refine(idx_t, data_t, distance_t, matrix_idx)       \\
diff --git a/cpp/src/neighbors/refine_float_float.cu b/cpp/src/neighbors/refine_float_float.cu
index 08976449f5..7e811fd7e3 100644
--- a/cpp/src/neighbors/refine_float_float.cu
+++ b/cpp/src/neighbors/refine_float_float.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by refine_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python refine_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/refine-inl.cuh>
 
 #define instantiate_raft_neighbors_refine(idx_t, data_t, distance_t, matrix_idx)      \
diff --git a/cpp/src/neighbors/refine_int8_t_float.cu b/cpp/src/neighbors/refine_int8_t_float.cu
index 66293b741a..6983c2492c 100644
--- a/cpp/src/neighbors/refine_int8_t_float.cu
+++ b/cpp/src/neighbors/refine_int8_t_float.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by refine_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python refine_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/refine-inl.cuh>
 
 #define instantiate_raft_neighbors_refine(idx_t, data_t, distance_t, matrix_idx)      \
diff --git a/cpp/src/neighbors/refine_uint8_t_float.cu b/cpp/src/neighbors/refine_uint8_t_float.cu
index c5e4f5e19c..f61bc508c0 100644
--- a/cpp/src/neighbors/refine_uint8_t_float.cu
+++ b/cpp/src/neighbors/refine_uint8_t_float.cu
@@ -15,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by refine_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python refine_00_generate.py
+ *
+ */
+
 #include <raft/neighbors/refine-inl.cuh>
 
 #define instantiate_raft_neighbors_refine(idx_t, data_t, distance_t, matrix_idx)      \
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_00_generate.py b/cpp/src/spatial/knn/detail/ball_cover/registers_00_generate.py
index 04e5a9e4b6..f8ce27728b 100644
--- a/cpp/src/spatial/knn/detail/ball_cover/registers_00_generate.py
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_00_generate.py
@@ -1,4 +1,16 @@
-#!/usr/bin/env python3
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 header = """/*
  * Copyright (c) 2021-2023, NVIDIA CORPORATION.
@@ -16,6 +28,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by registers_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python registers_00_generate.py
+ *
+ */
+
 #include <cstdint> // int64_t
 #include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
 
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu
index 42a14d11e0..b4ecac06e6 100644
--- a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_dist.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by registers_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python registers_00_generate.py
+ *
+ */
+
 #include <cstdint>  // int64_t
 #include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
 
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu
index 437b5a3d7e..31628d8b82 100644
--- a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_euclidean.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by registers_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python registers_00_generate.py
+ *
+ */
+
 #include <cstdint>  // int64_t
 #include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
 
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu
index ef69305571..80fda1bf9d 100644
--- a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_2d_haversine.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by registers_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python registers_00_generate.py
+ *
+ */
+
 #include <cstdint>  // int64_t
 #include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
 
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu
index 111513e6d0..40aa89aa39 100644
--- a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_dist.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by registers_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python registers_00_generate.py
+ *
+ */
+
 #include <cstdint>  // int64_t
 #include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
 
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu
index 98dbcac2aa..be159932a6 100644
--- a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_euclidean.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by registers_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python registers_00_generate.py
+ *
+ */
+
 #include <cstdint>  // int64_t
 #include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
 
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu
index 7b0c885986..a9fe8f355f 100644
--- a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_one_3d_haversine.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by registers_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python registers_00_generate.py
+ *
+ */
+
 #include <cstdint>  // int64_t
 #include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
 
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu
index 17dafbe862..b20df46a4f 100644
--- a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_dist.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by registers_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python registers_00_generate.py
+ *
+ */
+
 #include <cstdint>  // int64_t
 #include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
 
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu
index 7a3d770b87..d5042b0142 100644
--- a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_euclidean.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by registers_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python registers_00_generate.py
+ *
+ */
+
 #include <cstdint>  // int64_t
 #include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
 
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu
index 0c877ad717..01002d356e 100644
--- a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_2d_haversine.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by registers_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python registers_00_generate.py
+ *
+ */
+
 #include <cstdint>  // int64_t
 #include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
 
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu
index 2bfa4bdf5e..5746ab99fb 100644
--- a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_dist.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by registers_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python registers_00_generate.py
+ *
+ */
+
 #include <cstdint>  // int64_t
 #include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
 
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu
index 30f3ad97e9..fad007a2d4 100644
--- a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_euclidean.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by registers_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python registers_00_generate.py
+ *
+ */
+
 #include <cstdint>  // int64_t
 #include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
 
diff --git a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu
index 8b25ca9698..93083da5c6 100644
--- a/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu
+++ b/cpp/src/spatial/knn/detail/ball_cover/registers_pass_two_3d_haversine.cu
@@ -14,6 +14,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by registers_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python registers_00_generate.py
+ *
+ */
+
 #include <cstdint>  // int64_t
 #include <raft/spatial/knn/detail/ball_cover/registers-inl.cuh>
 

From ff1b955e025981d9aa486d7e144d0479fbba91f4 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 16:45:28 +0200
Subject: [PATCH 53/89] Split selection_faiss source file

selection_faiss.cu was taking a long time to compile, so split it into
one source file per template instantiation.
---
 cpp/CMakeLists.txt                            |  7 +-
 .../neighbors/detail/selection_faiss-ext.cuh  |  2 -
 .../detail/selection_faiss_00_generate.py     | 75 +++++++++++++++++++
 .../detail/selection_faiss_int32_t_float.cu   | 44 +++++++++++
 .../detail/selection_faiss_int_double.cu      | 44 +++++++++++
 .../detail/selection_faiss_long_float.cu      | 44 +++++++++++
 .../detail/selection_faiss_size_t_double.cu   | 44 +++++++++++
 .../detail/selection_faiss_size_t_float.cu    | 44 +++++++++++
 ...s.cu => selection_faiss_uint32_t_float.cu} | 19 ++---
 9 files changed, 311 insertions(+), 12 deletions(-)
 create mode 100644 cpp/src/neighbors/detail/selection_faiss_00_generate.py
 create mode 100644 cpp/src/neighbors/detail/selection_faiss_int32_t_float.cu
 create mode 100644 cpp/src/neighbors/detail/selection_faiss_int_double.cu
 create mode 100644 cpp/src/neighbors/detail/selection_faiss_long_float.cu
 create mode 100644 cpp/src/neighbors/detail/selection_faiss_size_t_double.cu
 create mode 100644 cpp/src/neighbors/detail/selection_faiss_size_t_float.cu
 rename cpp/src/neighbors/detail/{selection_faiss.cu => selection_faiss_uint32_t_float.cu} (75%)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 701b056771..1c705cc786 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -318,7 +318,12 @@ if(RAFT_COMPILE_LIBRARY)
     src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false.cu
     src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true.cu
     src/neighbors/detail/ivf_pq_compute_similarity_half_half.cu
-    src/neighbors/detail/selection_faiss.cu
+    src/neighbors/detail/selection_faiss_uint32_t_float.cu
+    src/neighbors/detail/selection_faiss_int32_t_float.cu
+    src/neighbors/detail/selection_faiss_long_float.cu
+    src/neighbors/detail/selection_faiss_size_t_double.cu
+    src/neighbors/detail/selection_faiss_int_double.cu
+    src/neighbors/detail/selection_faiss_size_t_float.cu
     src/neighbors/ivf_flat_build_float_int64_t.cu
     src/neighbors/ivf_flat_build_int8_t_int64_t.cu
     src/neighbors/ivf_flat_build_uint8_t_int64_t.cu
diff --git a/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
index c724aa0407..7ff30e3eff 100644
--- a/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
@@ -62,8 +62,6 @@ void select_k(const key_t* inK,
                                                          int k,                \
                                                          cudaStream_t stream)
 
-// @benfred: Not sure if this is correct. Should I not flip float and uint32_t?
-// It seems weird that float is the key and uint32_t is the payload type.
 instantiate_raft_neighbors_detail_select_k(uint32_t, float);
 instantiate_raft_neighbors_detail_select_k(int32_t, float);
 instantiate_raft_neighbors_detail_select_k(long, float);
diff --git a/cpp/src/neighbors/detail/selection_faiss_00_generate.py b/cpp/src/neighbors/detail/selection_faiss_00_generate.py
new file mode 100644
index 0000000000..36ba56c9b3
--- /dev/null
+++ b/cpp/src/neighbors/detail/selection_faiss_00_generate.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+header = """
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by selection_faiss_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python selection_faiss_00_generate.py
+ *
+ */
+
+#include <cstddef>  // size_t
+#include <cstdint>  // uint32_t
+#include <raft/neighbors/detail/selection_faiss-inl.cuh>
+
+#define instantiate_raft_neighbors_detail_select_k(payload_t, key_t)    \\
+  template void raft::neighbors::detail::select_k(const key_t* inK,     \\
+                                                  const payload_t* inV, \\
+                                                  size_t n_rows,        \\
+                                                  size_t n_cols,        \\
+                                                  key_t* outK,          \\
+                                                  payload_t* outV,      \\
+                                                  bool select_min,      \\
+                                                  int k,                \\
+                                                  cudaStream_t stream)
+
+"""
+
+types = dict(  # output-file suffix -> (payload_t, key_t) arguments for the macro
+    uint32_t_float=("uint32_t", "float"),
+    int32_t_float=("int32_t", "float"),
+    long_float=("long", "float"),
+    size_t_double=("size_t", "double"),
+    int_double=("int", "double"),
+    size_t_float=("size_t", "float"),
+)
+
+for type_path, (payload_t, key_t) in types.items():
+    path = f"selection_faiss_{type_path}.cu"
+    with open(path, "w") as f:  # relative path: written to the current directory
+        f.write(header)
+        f.write(f"instantiate_raft_neighbors_detail_select_k({payload_t}, {key_t});\n\n")
+        f.write("#undef instantiate_raft_neighbors_detail_select_k\n")
+
+    # Print the path as listed in cpp/CMakeLists.txt, for pasting into it.
+    print(f"src/neighbors/detail/{path}")
diff --git a/cpp/src/neighbors/detail/selection_faiss_int32_t_float.cu b/cpp/src/neighbors/detail/selection_faiss_int32_t_float.cu
new file mode 100644
index 0000000000..1f1ece05ae
--- /dev/null
+++ b/cpp/src/neighbors/detail/selection_faiss_int32_t_float.cu
@@ -0,0 +1,44 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by selection_faiss_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python selection_faiss_00_generate.py
+ *
+ */
+
+#include <cstddef>  // size_t
+#include <cstdint>  // uint32_t
+#include <raft/neighbors/detail/selection_faiss-inl.cuh>
+
+#define instantiate_raft_neighbors_detail_select_k(payload_t, key_t)    \
+  template void raft::neighbors::detail::select_k(const key_t* inK,     \
+                                                  const payload_t* inV, \
+                                                  size_t n_rows,        \
+                                                  size_t n_cols,        \
+                                                  key_t* outK,          \
+                                                  payload_t* outV,      \
+                                                  bool select_min,      \
+                                                  int k,                \
+                                                  cudaStream_t stream)
+
+instantiate_raft_neighbors_detail_select_k(int32_t, float);
+
+#undef instantiate_raft_neighbors_detail_select_k
diff --git a/cpp/src/neighbors/detail/selection_faiss_int_double.cu b/cpp/src/neighbors/detail/selection_faiss_int_double.cu
new file mode 100644
index 0000000000..7e832410c4
--- /dev/null
+++ b/cpp/src/neighbors/detail/selection_faiss_int_double.cu
@@ -0,0 +1,44 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by selection_faiss_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python selection_faiss_00_generate.py
+ *
+ */
+
+#include <cstddef>  // size_t
+#include <cstdint>  // uint32_t
+#include <raft/neighbors/detail/selection_faiss-inl.cuh>
+
+#define instantiate_raft_neighbors_detail_select_k(payload_t, key_t)    \
+  template void raft::neighbors::detail::select_k(const key_t* inK,     \
+                                                  const payload_t* inV, \
+                                                  size_t n_rows,        \
+                                                  size_t n_cols,        \
+                                                  key_t* outK,          \
+                                                  payload_t* outV,      \
+                                                  bool select_min,      \
+                                                  int k,                \
+                                                  cudaStream_t stream)
+
+instantiate_raft_neighbors_detail_select_k(int, double);
+
+#undef instantiate_raft_neighbors_detail_select_k
diff --git a/cpp/src/neighbors/detail/selection_faiss_long_float.cu b/cpp/src/neighbors/detail/selection_faiss_long_float.cu
new file mode 100644
index 0000000000..441d54fa30
--- /dev/null
+++ b/cpp/src/neighbors/detail/selection_faiss_long_float.cu
@@ -0,0 +1,44 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by selection_faiss_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python selection_faiss_00_generate.py
+ *
+ */
+
+#include <cstddef>  // size_t
+#include <cstdint>  // uint32_t
+#include <raft/neighbors/detail/selection_faiss-inl.cuh>
+
+#define instantiate_raft_neighbors_detail_select_k(payload_t, key_t)    \
+  template void raft::neighbors::detail::select_k(const key_t* inK,     \
+                                                  const payload_t* inV, \
+                                                  size_t n_rows,        \
+                                                  size_t n_cols,        \
+                                                  key_t* outK,          \
+                                                  payload_t* outV,      \
+                                                  bool select_min,      \
+                                                  int k,                \
+                                                  cudaStream_t stream)
+
+instantiate_raft_neighbors_detail_select_k(long, float);
+
+#undef instantiate_raft_neighbors_detail_select_k
diff --git a/cpp/src/neighbors/detail/selection_faiss_size_t_double.cu b/cpp/src/neighbors/detail/selection_faiss_size_t_double.cu
new file mode 100644
index 0000000000..ca310e7697
--- /dev/null
+++ b/cpp/src/neighbors/detail/selection_faiss_size_t_double.cu
@@ -0,0 +1,44 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by selection_faiss_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python selection_faiss_00_generate.py
+ *
+ */
+
+#include <cstddef>  // size_t
+#include <cstdint>  // uint32_t
+#include <raft/neighbors/detail/selection_faiss-inl.cuh>
+
+#define instantiate_raft_neighbors_detail_select_k(payload_t, key_t)    \
+  template void raft::neighbors::detail::select_k(const key_t* inK,     \
+                                                  const payload_t* inV, \
+                                                  size_t n_rows,        \
+                                                  size_t n_cols,        \
+                                                  key_t* outK,          \
+                                                  payload_t* outV,      \
+                                                  bool select_min,      \
+                                                  int k,                \
+                                                  cudaStream_t stream)
+
+instantiate_raft_neighbors_detail_select_k(size_t, double);
+
+#undef instantiate_raft_neighbors_detail_select_k
diff --git a/cpp/src/neighbors/detail/selection_faiss_size_t_float.cu b/cpp/src/neighbors/detail/selection_faiss_size_t_float.cu
new file mode 100644
index 0000000000..a830e6ecac
--- /dev/null
+++ b/cpp/src/neighbors/detail/selection_faiss_size_t_float.cu
@@ -0,0 +1,44 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by selection_faiss_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python selection_faiss_00_generate.py
+ *
+ */
+
+#include <cstddef>  // size_t
+#include <cstdint>  // uint32_t
+#include <raft/neighbors/detail/selection_faiss-inl.cuh>
+
+#define instantiate_raft_neighbors_detail_select_k(payload_t, key_t)    \
+  template void raft::neighbors::detail::select_k(const key_t* inK,     \
+                                                  const payload_t* inV, \
+                                                  size_t n_rows,        \
+                                                  size_t n_cols,        \
+                                                  key_t* outK,          \
+                                                  payload_t* outV,      \
+                                                  bool select_min,      \
+                                                  int k,                \
+                                                  cudaStream_t stream)
+
+instantiate_raft_neighbors_detail_select_k(size_t, float);
+
+#undef instantiate_raft_neighbors_detail_select_k
diff --git a/cpp/src/neighbors/detail/selection_faiss.cu b/cpp/src/neighbors/detail/selection_faiss_uint32_t_float.cu
similarity index 75%
rename from cpp/src/neighbors/detail/selection_faiss.cu
rename to cpp/src/neighbors/detail/selection_faiss_uint32_t_float.cu
index 034f37f8cc..2fecaa5cf1 100644
--- a/cpp/src/neighbors/detail/selection_faiss.cu
+++ b/cpp/src/neighbors/detail/selection_faiss_uint32_t_float.cu
@@ -1,3 +1,4 @@
+
 /*
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
@@ -14,6 +15,15 @@
  * limitations under the License.
  */
 
+/*
+ * NOTE: this file is generated by selection_faiss_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python selection_faiss_00_generate.py
+ *
+ */
+
 #include <cstddef>  // size_t
 #include <cstdint>  // uint32_t
 #include <raft/neighbors/detail/selection_faiss-inl.cuh>
@@ -29,15 +39,6 @@
                                                   int k,                \
                                                   cudaStream_t stream)
 
-// @benfred: Not sure if this is correct. Should I not flip float and uint32_t?
-// It seems weird that float is the key and uint32_t is the payload type.
 instantiate_raft_neighbors_detail_select_k(uint32_t, float);
-instantiate_raft_neighbors_detail_select_k(int32_t, float);
-instantiate_raft_neighbors_detail_select_k(long, float);
-// Needed by the tests
-instantiate_raft_neighbors_detail_select_k(size_t, double);
-// test/neighbors/selection.cu
-instantiate_raft_neighbors_detail_select_k(int, double);
-instantiate_raft_neighbors_detail_select_k(size_t, float);
 
 #undef instantiate_raft_neighbors_detail_select_k

From dfe860e28afa54e58a5249011eb226ec914f6c6c Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 17:41:59 +0200
Subject: [PATCH 54/89] Update docs

---
 docs/source/using_libraft.md | 83 +++++++++++++++---------------------
 1 file changed, 34 insertions(+), 49 deletions(-)

diff --git a/docs/source/using_libraft.md b/docs/source/using_libraft.md
index 6fa2e644a9..a50b6ad690 100644
--- a/docs/source/using_libraft.md
+++ b/docs/source/using_libraft.md
@@ -4,15 +4,6 @@ At its core, RAFT is a header-only template library, which makes it very powerfu
 
 For most functions, compile-time overhead is minimal but some of RAFT's APIs take a substantial time to compile. As a rule of thumb, most functionality in `raft::distance`, `raft::neighbors`, and `raft::spatial` is expensive to compile and most functionality in other namespaces has little compile-time overhead.
 
-
-To speed up compilation when using RAFT as a header-only library, you can do the following... 
-
-To speed up compilation when using the precompiled RAFT library, you can do the
-following:
-
-1. 
-
-
 There are three ways to speed up compile times:
 
 1. Continue to use RAFT as a header-only library and create a CUDA source file
@@ -29,51 +20,45 @@ There are three ways to speed up compile times:
    compile any CUDA code yourself, you can simply add `libraft` to your link
    libraries and use the growing set of runtime APIs.
 
-## Using Template Specializations
+### How do I verify template instantiations didn't compile into my binary?
 
-As mentioned above, the pre-compiled template instantiations can save a lot of time if you are able to use the type combinations for the templates which are already specialized in the `libraft` binary. This will, of course, mean that you will need to add `libraft` to your link libraries.
-
-At the top level of each namespace containing pre-compiled template specializations is a header file called `specializations.cuh`. This header file includes `extern template` directives for all the specializations which are compiled into libraft. As an example, including `raft/neighbors/specializations.cuh` in one of your source files will effectively tell the compiler to skip over any of the template specializations that are already compiled into the `libraft` binary.
-
-### How do I verify template specializations didn't compile into my binary?
-
-Which specializations were chosen to instantiations were based on compile time analysis and reuse. This means you can't assume that all specializations are for the public API itself. Take the following example in `raft/neighbors/specializations/detail/ivf_pq_compute_similarity.cuh`:
+To verify that you are not accidentally instantiating templates that have not been pre-compiled in RAFT, define the `RAFT_EXPLICIT_INSTANTIATE` macro. This only works if you are linking with the pre-compiled libraft (i.e., when `RAFT_COMPILED` has been defined). To check whether, for instance, `raft::distance::distance` has been pre-compiled with specific template arguments, define `RAFT_EXPLICIT_INSTANTIATE` at the top of the file you are compiling, before any RAFT header is included, as in the following example:
 
 ```c++
-namespace raft::neighbors::ivf_pq::detail {
-
-namespace {
-using fp8s_t = fp_8bit<5, true>;
-using fp8u_t = fp_8bit<5, false>;
-}  // namespace
-
-#define RAFT_INST(OutT, LutT)                                                                     \
-  extern template auto get_compute_similarity_kernel<OutT, LutT, true, true>(uint32_t, uint32_t)  \
-    ->compute_similarity_kernel_t<OutT, LutT>;                                                    \
-  extern template auto get_compute_similarity_kernel<OutT, LutT, true, false>(uint32_t, uint32_t) \
-    ->compute_similarity_kernel_t<OutT, LutT>;                                                    \
-  extern template auto get_compute_similarity_kernel<OutT, LutT, false, true>(uint32_t, uint32_t) \
-    ->compute_similarity_kernel_t<OutT, LutT>;
-
-#define RAFT_INST_ALL_OUT_T(LutT) \
-  RAFT_INST(float, LutT)          \
-  RAFT_INST(half, LutT)
-
-RAFT_INST_ALL_OUT_T(float)
-RAFT_INST_ALL_OUT_T(half)
-RAFT_INST_ALL_OUT_T(fp8s_t)
-RAFT_INST_ALL_OUT_T(fp8u_t)
-
-#undef RAFT_INST
-#undef RAFT_INST_ALL_OUT_T
-
-}  // namespace raft::neighbors::ivf_pq::detail
-```
 
-We can see here that the function `raft::neighbors::ivf_pq::detail::get_compute_similarity_kernel` is being instantiated for the cartesian product of `OutT={float, half, fp8s_t, fp8u_t}` and `LutT={float, half}`. After linking against the `libraft` binary and including `raft/neighbors/specializations.cuh` in your source file, you can invoke the `raft::neighbors::ivf_pq` functions and compile your code. If the specializations are working, you should be able to use `nm -g -C --defined-only /path/to/your/binary | grep raft::neighbors::ivf_pq::detail::get_compute_similarity::kernel` and you shouldn't see any results, because those symbols should be coming from the `libraft` binary and skipped from compiling into your binary.
+#ifdef RAFT_COMPILED
+#define RAFT_EXPLICIT_INSTANTIATE
+#endif
+
+#include <cstdint>
+#include <raft/core/device_resources.hpp>
+#include <raft/distance/distance.cuh>
+
+int main()
+{
+  raft::resources handle{};
+
+  // Change IdxT to uint64_t and you will get an error because you are
+  // instantiating a template that has not been pre-compiled.
+  using IdxT = int;
+
+  const float* x = nullptr;
+  const float* y = nullptr;
+  float* out     = nullptr;
+  int m          = 1024;
+  int n          = 1024;
+  int k          = 1024;
+  bool row_major = true;
+  raft::distance::distance<raft::distance::DistanceType::L1, float, float, float, IdxT>(
+    handle, x, y, out, m, n, k, row_major, 2.0f);
+}
+```
 
 ## Runtime APIs
 
-RAFT contains a growing list of runtime APIs that, unlike the pre-compiled template specializations, allow you to link against `libraft` and invoke RAFT directly from `cpp` files. The benefit to RAFT's runtime APIs are two-fold- unlike the template specializations, which still require your code be compiled with the CUDA compiler (`nvcc`), the `runtime` APIs are the lightweight wrappers which enable `pylibraft`.
+RAFT contains a growing list of runtime APIs that, unlike the pre-compiled
+template instantiations, allow you to link against `libraft` and invoke RAFT
+directly from `cpp` files. The benefit of RAFT's runtime APIs is that they can
+be used from code that is compiled with a C++ compiler (rather than the CUDA
+compiler `nvcc`). This enables the `runtime` APIs to power `pylibraft`.
 
-Similar to the pre-compiled template specializations, RAFT's runtime APIs 

From 1ee301e0730c32f9a6dfaab71cb0d52e5d1e979b Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 17:46:04 +0200
Subject: [PATCH 55/89] Replace specialization with instantiation

where appropriate
---
 README.md                                          | 14 +++++++-------
 .../detail/pairwise_matrix/dispatch-inl.cuh        |  4 ++--
 .../detail/ivf_pq_compute_similarity-inl.cuh       |  5 +++--
 .../raft/neighbors/detail/knn_brute_force.cuh      |  2 +-
 docs/source/build.md                               |  2 +-
 5 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index b77e906262..3bbb5de5fa 100755
--- a/README.md
+++ b/README.md
@@ -198,7 +198,7 @@ RAFT itself can be installed through conda, [CMake Package Manager (CPM)](https:
 
 The easiest way to install RAFT is through conda and several packages are provided.
 - `libraft-headers` RAFT headers
-- `libraft` (optional) shared library of pre-compiled template specializations and runtime APIs.
+- `libraft` (optional) shared library of pre-compiled template instantiations and runtime APIs.
 - `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives.
 - `raft-dask` (optional) enables deployment of multi-node multi-GPU algorithms that use RAFT `raft::comms` in Dask clusters.
 
@@ -231,11 +231,11 @@ You can find an [example RAFT](cpp/template/README.md) project template in the `
 
 Additional CMake targets can be made available by adding components in the table below to the `RAFT_COMPONENTS` list above, separated by spaces. The `raft::raft` target will always be available. RAFT headers require, at a minimum, the CUDA toolkit libraries and RMM dependencies.
 
-| Component   | Target              | Description                                               | Base Dependencies                     |
-|-------------|---------------------|-----------------------------------------------------------|---------------------------------------|
-| n/a         | `raft::raft`        | Full RAFT header library                                  | CUDA toolkit, RMM, NVTX, CCCL, CUTLASS |
-| compiled    | `raft::compiled`    | Pre-compiled template specializations and runtime library | raft::raft                            |
-| distributed | `raft::distributed` | Dependencies for `raft::comms` APIs                       | raft::raft, UCX, NCCL                 |
+| Component   | Target              | Description                                              | Base Dependencies                      |
+|-------------|---------------------|----------------------------------------------------------|----------------------------------------|
+| n/a         | `raft::raft`        | Full RAFT header library                                 | CUDA toolkit, RMM, NVTX, CCCL, CUTLASS |
+| compiled    | `raft::compiled`    | Pre-compiled template instantiations and runtime library | raft::raft                             |
+| distributed | `raft::distributed` | Dependencies for `raft::comms` APIs                      | raft::raft, UCX, NCCL                  |
 
 ### Source
 
@@ -282,7 +282,7 @@ The folder structure mirrors other RAPIDS repos, with the following folders:
     - `util`: Various reusable tools and utilities for accelerated algorithm development
   - `internal`: A private header-only component that hosts the code shared between benchmarks and tests.
   - `scripts`: Helpful scripts for development
-  - `src`: Compiled APIs and template specializations for the shared libraries
+  - `src`: Compiled APIs and template instantiations for the shared libraries
   - `template`: A skeleton template containing the bare-bones file structure and cmake configuration for writing applications with RAFT.
   - `test`: Googletests source code
 - `docs`: Source code and scripts for building library documentation (Uses breath, doxygen, & pydocs)
diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh
index 55a064acf9..bb4422735b 100644
--- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-inl.cuh
@@ -24,7 +24,7 @@
  *    architectures.
  *
  * 2. Provide concise function templates that can be instantiated in
- *    src/distance/distance/specializations/detail/. Previously,
+ *    src/distance/detail/pairwise_matrix/. Previously,
  *    raft::distance::detail::distance was instantiated. The function
  *    necessarily required a large set of include files, which slowed down the
  *    build. The raft::distance::detail::pairwise_matrix_arch_dispatch functions
@@ -46,7 +46,7 @@ namespace raft::distance::detail {
 
 // This forward-declaration ensures that we do not need to include
 // dispatch_sm80.cuh if we are not calling it in practice. This makes compiling
-// all the non-CUTLASS based distance specializations faster. For CUTLASS-based
+// all the non-CUTLASS based distance instantiations faster. For CUTLASS-based
 // distances, dispatch_sm80.cuh has to be included by the file including this
 // file.
 template <typename OpT,
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
index de79d3f6c8..7573e2ca13 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-inl.cuh
@@ -519,8 +519,9 @@ struct compute_similarity_kernel_config {
   }
 };
 
-// A standalone accessor function is necessary to make sure template specializations work correctly
-// (we "extern template" this function)
+// A standalone accessor function was necessary to make sure template
+// instantiations work correctly. This accessor function is not used anymore and
+// may be removed.
 template <typename OutT, typename LutT, bool PrecompBaseDiff, bool EnableSMemLut>
 auto get_compute_similarity_kernel(uint32_t pq_bits, uint32_t k_max)
   -> compute_similarity_kernel_t<OutT, LutT>
diff --git a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
index 0148a1a887..5d099b8d67 100644
--- a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
+++ b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
@@ -142,7 +142,7 @@ void tiled_brute_force_knn(const raft::device_resources& handle,
       // calculate the top-k elements for the current tile, by calculating the
       // full pairwise distance for the tile - and then selecting the top-k from that
       // note: we're using a int32 IndexType here on purpose in order to
-      // use the pairwise_distance specializations. Since the tile size will ensure
+      // use the pairwise_distance instantiations. Since the tile size will ensure
       // that the total memory is < 1GB per tile, this will not cause any issues
       distance::pairwise_distance<ElementType, int>(handle,
                                                     search + i * d,
diff --git a/docs/source/build.md b/docs/source/build.md
index 021286847d..bd2afe6638 100644
--- a/docs/source/build.md
+++ b/docs/source/build.md
@@ -4,7 +4,7 @@
 
 The easiest way to install RAFT is through conda and several packages are provided.
 - `libraft-headers` RAFT headers
-- `libraft` (optional) shared library containing pre-compiled template specializations and runtime API.
+- `libraft` (optional) shared library containing pre-compiled template instantiations and runtime API.
 - `pylibraft` (optional) Python wrappers around RAFT algorithms and primitives.
 - `raft-dask` (optional) enables deployment of multi-node multi-GPU algorithms that use RAFT `raft::comms` in Dask clusters.
 

From 48795141acbfe86ee992cb32d726cb88e554895f Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 17:56:01 +0200
Subject: [PATCH 56/89] Fix ivf benchmarks

---
 cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h | 1 +
 cpp/bench/ann/src/raft/raft_ivf_pq.cu          | 9 ++++-----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h
index 8b2a7d329b..0a80eef1b5 100644
--- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h
@@ -29,6 +29,7 @@
 #include <raft/neighbors/ivf_flat_types.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
 #include <stdexcept>
 #include <string>
 #include <type_traits>
diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq.cu b/cpp/bench/ann/src/raft/raft_ivf_pq.cu
index 47f7f66d3a..2efe14631b 100644
--- a/cpp/bench/ann/src/raft/raft_ivf_pq.cu
+++ b/cpp/bench/ann/src/raft/raft_ivf_pq.cu
@@ -15,9 +15,8 @@
  */
 #include "raft_ivf_pq_wrapper.h"
 
-onamespace raft::bench::ann
-{
-  template class RaftIvfPQ<float, int64_t>;
-  template class RaftIvfPQ<uint8_t, int64_t>;
-  template class RaftIvfPQ<int8_t, int64_t>;
+namespace raft::bench::ann {
+template class RaftIvfPQ<float, int64_t>;
+template class RaftIvfPQ<uint8_t, int64_t>;
+template class RaftIvfPQ<int8_t, int64_t>;
 }  // namespace raft::bench::ann

From 7461085fcf4428fabb72f8daa1a2179655bb5390 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 17:56:28 +0200
Subject: [PATCH 57/89] Remove preprocessor logic from distance test

This logic is now part of RAFT
---
 cpp/test/distance/distance_base.cuh | 33 ++++-------------------------
 1 file changed, 4 insertions(+), 29 deletions(-)

diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh
index 438e212fbd..45c2685001 100644
--- a/cpp/test/distance/distance_base.cuh
+++ b/cpp/test/distance/distance_base.cuh
@@ -18,23 +18,14 @@
 #include <gtest/gtest.h>
 #include <raft/common/nvtx.hpp>  // common::nvtx::range
 
-#include <raft/core/device_mdspan.hpp>       // make_device_matrix_view
-#include <raft/core/device_resources.hpp>    // raft::device_resources
-#include <raft/core/operators.hpp>           // raft::sqrt
+#include <raft/core/device_mdspan.hpp>     // make_device_matrix_view
+#include <raft/core/device_resources.hpp>  // raft::device_resources
+#include <raft/core/operators.hpp>         // raft::sqrt
+#include <raft/distance/distance.cuh>
 #include <raft/distance/distance_types.hpp>  // raft::distance::DistanceType
 #include <raft/random/rng.cuh>
 #include <rmm/device_uvector.hpp>  // rmm::device_uvector
 
-// When the distance library is precompiled, include only the raft_runtime
-// headers. This way, a small change in one of the kernel internals does not
-// trigger a rebuild of the test files (it of course still triggers a rebuild of
-// the raft specializations)
-#if defined RAFT_COMPILED
-#include <raft_runtime/distance/pairwise_distance.hpp>
-#else
-#include <raft/distance/distance.cuh>
-#endif
-
 namespace raft {
 namespace distance {
 
@@ -449,23 +440,12 @@ void distanceLauncher(raft::device_resources const& handle,
                       DataType threshold,
                       DataType metric_arg = 2.0f)
 {
-#if defined RAFT_COMPILED
-  // TODO: Implement and use mdspan-based
-  // raft::runtime::distance::pairwise_distance here.
-  //
-  // Context:
-  // https://github.com/rapidsai/raft/issues/1338
-  bool row_major = layout_to_row_major<layout>();
-  raft::runtime::distance::pairwise_distance(
-    handle, x, y, dist, m, n, k, distanceType, row_major, metric_arg);
-#else
   auto x_v    = make_device_matrix_view<DataType, int, layout>(x, m, k);
   auto y_v    = make_device_matrix_view<DataType, int, layout>(y, n, k);
   auto dist_v = make_device_matrix_view<DataType, int, layout>(dist, m, n);
 
   raft::distance::distance<distanceType, DataType, DataType, DataType, layout>(
     handle, x_v, y_v, dist_v, metric_arg);
-#endif
 }
 
 template <raft::distance::DistanceType distanceType, typename DataType>
@@ -573,13 +553,8 @@ class BigMatrixDistanceTest : public ::testing::Test {
                            float metric_arg);
     constexpr bool row_major   = true;
     constexpr float metric_arg = 0.0f;
-#if defined RAFT_COMPILED
-    raft::runtime::distance::pairwise_distance(
-      handle, x.data(), x.data(), dist.data(), m, n, k, distanceType, row_major, metric_arg);
-#else
     raft::distance::distance<distanceType, float, float, float>(
       handle, x.data(), x.data(), dist.data(), m, n, k, row_major, metric_arg);
-#endif
     RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream()));
   }
 

From 32fb40b93087642b2e3fed8061b10a7ff75900c0 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 18:26:08 +0200
Subject: [PATCH 58/89] Fix benchmarks

---
 cpp/bench/prims/distance/fused_l2_nn.cu       |  1 +
 cpp/bench/prims/distance/masked_nn.cu         | 10 ++++++----
 cpp/include/raft/distance/fused_l2_nn-ext.cuh | 15 +++++++++++++++
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/cpp/bench/prims/distance/fused_l2_nn.cu b/cpp/bench/prims/distance/fused_l2_nn.cu
index d1b3896cc9..24c0cbf8f9 100644
--- a/cpp/bench/prims/distance/fused_l2_nn.cu
+++ b/cpp/bench/prims/distance/fused_l2_nn.cu
@@ -16,6 +16,7 @@
 
 #include <common/benchmark.hpp>
 #include <raft/distance/fused_l2_nn.cuh>
+#include <raft/linalg/norm.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/bench/prims/distance/masked_nn.cu b/cpp/bench/prims/distance/masked_nn.cu
index c804ecb3a1..033c29e209 100644
--- a/cpp/bench/prims/distance/masked_nn.cu
+++ b/cpp/bench/prims/distance/masked_nn.cu
@@ -25,6 +25,7 @@
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/handle.hpp>
+#include <raft/distance/detail/fused_l2_nn.cuh>  // MinAndDistanceReduceOpImpl
 #include <raft/distance/masked_nn.cuh>
 #include <raft/linalg/norm.cuh>
 #include <raft/random/rng.cuh>
@@ -91,8 +92,8 @@ struct masked_l2_nn : public fixture {
   using DataT      = T;
   using IdxT       = int;
   using OutT       = raft::KeyValuePair<IdxT, DataT>;
-  using RedOpT     = raft::distance::MinAndDistanceReduceOp<int, DataT>;
-  using PairRedOpT = raft::distance::KVPMinReduce<int, DataT>;
+  using RedOpT     = raft::distance::detail::MinAndDistanceReduceOpImpl<int, DataT>;
+  using PairRedOpT = raft::distance::detail::KVPMinReduceImpl<int, DataT>;
   using ParamT     = raft::distance::masked_l2_nn_params<RedOpT, PairRedOpT>;
 
   // Parameters
@@ -122,8 +123,9 @@ struct masked_l2_nn : public fixture {
       xn.data_handle(), x.data_handle(), p.k, p.m, raft::linalg::L2Norm, true, stream);
     raft::linalg::rowNorm(
       yn.data_handle(), y.data_handle(), p.k, p.n, raft::linalg::L2Norm, true, stream);
-    raft::distance::initialize<T, raft::KeyValuePair<int, T>, int>(
-      handle, out.data_handle(), p.m, std::numeric_limits<T>::max(), RedOpT{});
+    // Avoid instantiating raft::distance::initialize.
+    raft::distance::detail::initialize<T, raft::KeyValuePair<int, T>, int>(
+      out.data_handle(), p.m, std::numeric_limits<T>::max(), RedOpT{}, handle.get_stream());
 
     dim3 block(32, 32);
     dim3 grid(10, 10);
diff --git a/cpp/include/raft/distance/fused_l2_nn-ext.cuh b/cpp/include/raft/distance/fused_l2_nn-ext.cuh
index b9b507179b..4968fcb602 100644
--- a/cpp/include/raft/distance/fused_l2_nn-ext.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn-ext.cuh
@@ -30,6 +30,21 @@ namespace distance {
  * @}
  */
 
+/**
+ * Initialize array using init value from reduction op
+ */
+template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
+void initialize(raft::device_resources const& handle,
+                OutT* min,
+                IdxT m,
+                DataT maxVal,
+                ReduceOpT redOp) RAFT_EXPLICIT;
+
+/**
+ * \ingroup fused_l2_nn
+ * @{
+ */
+
 /**
  * @brief Wrapper around fusedL2NN with minimum reduction operators.
  *

From 7ff4fadd4d153cdbf3f08cd8e813dfe668a88ff2 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 18:26:25 +0200
Subject: [PATCH 59/89] Add ALL_BENCH CMake target

---
 cpp/bench/prims/CMakeLists.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index c0c1706b2a..e222316635 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -141,4 +141,11 @@ if(BUILD_PRIMS_BENCH)
     OPTIONAL
     LIB
   )
+
+  add_custom_target(ALL_BENCH)
+  add_dependencies(
+    ALL_BENCH CLUSTER_BENCH DISTANCE_BENCH LINALG_BENCH MATRIX_BENCH NEIGHBORS_BENCH RANDOM_BENCH
+    SPARSE_BENCH TUNE_DISTANCE
+  )
+
 endif()

From 5c0380450b2a8bdf14d59099c78d04e52764006c Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Fri, 14 Apr 2023 19:04:13 +0200
Subject: [PATCH 60/89] Fix non-standard instantiation

---
 cpp/include/raft/neighbors/ivf_flat-ext.cuh          | 2 +-
 cpp/src/neighbors/ivf_flat_00_generate.py            | 2 +-
 cpp/src/neighbors/ivf_flat_search_float_int64_t.cu   | 2 +-
 cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu  | 2 +-
 cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/include/raft/neighbors/ivf_flat-ext.cuh b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
index 675b2fd97b..9dcc0575d7 100644
--- a/cpp/include/raft/neighbors/ivf_flat-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
@@ -484,7 +484,7 @@ instantiate_raft_neighbors_ivf_flat_extend(uint8_t, int64_t);
     uint32_t k,                                                    \
     IdxT* neighbors,                                               \
     float* distances,                                              \
-    rmm::mr::device_memory_resource* mr = nullptr);                \
+    rmm::mr::device_memory_resource* mr);                          \
                                                                    \
   extern template void raft::neighbors::ivf_flat::search<T, IdxT>( \
     raft::device_resources const& handle,                          \
diff --git a/cpp/src/neighbors/ivf_flat_00_generate.py b/cpp/src/neighbors/ivf_flat_00_generate.py
index 7ba04646d5..44ea9709c2 100644
--- a/cpp/src/neighbors/ivf_flat_00_generate.py
+++ b/cpp/src/neighbors/ivf_flat_00_generate.py
@@ -111,7 +111,7 @@
     uint32_t k,                                                    \\
     IdxT* neighbors,                                               \\
     float* distances,                                              \\
-    rmm::mr::device_memory_resource* mr = nullptr);                \\
+    rmm::mr::device_memory_resource* mr );                         \\
                                                                    \\
   template void raft::neighbors::ivf_flat::search<T, IdxT>( \\
     raft::device_resources const& handle,                          \\
diff --git a/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu
index 60e0105fd1..5a1fae6d5a 100644
--- a/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu
@@ -35,7 +35,7 @@
     uint32_t k,                                                 \
     IdxT* neighbors,                                            \
     float* distances,                                           \
-    rmm::mr::device_memory_resource* mr = nullptr);             \
+    rmm::mr::device_memory_resource* mr);                       \
                                                                 \
   template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
     raft::device_resources const& handle,                       \
diff --git a/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu
index b317c3fc5b..bc84159a41 100644
--- a/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu
@@ -35,7 +35,7 @@
     uint32_t k,                                                 \
     IdxT* neighbors,                                            \
     float* distances,                                           \
-    rmm::mr::device_memory_resource* mr = nullptr);             \
+    rmm::mr::device_memory_resource* mr);                       \
                                                                 \
   template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
     raft::device_resources const& handle,                       \
diff --git a/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
index 1545b4dbcf..9e70e21af4 100644
--- a/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
@@ -35,7 +35,7 @@
     uint32_t k,                                                 \
     IdxT* neighbors,                                            \
     float* distances,                                           \
-    rmm::mr::device_memory_resource* mr = nullptr);             \
+    rmm::mr::device_memory_resource* mr);                       \
                                                                 \
   template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
     raft::device_resources const& handle,                       \

From 018910de7cf19a11e20048318dce250de5df18e4 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 17 Apr 2023 11:44:57 +0200
Subject: [PATCH 61/89] Fix docs build

---
 cpp/include/raft/neighbors/ivf_flat-inl.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/neighbors/ivf_flat-inl.cuh b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
index 1bfe206608..88229b39d3 100644
--- a/cpp/include/raft/neighbors/ivf_flat-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
@@ -239,7 +239,7 @@ auto extend(raft::device_resources const& handle,
  *
  * @param[in] handle
  * @param[in] new_vectors raft::device_matrix_view to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices optional raft::device_matrix_view to a vector of indices [n_rows].
+ * @param[in] new_indices optional raft::device_vector_view to a vector of indices [n_rows].
  *    If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt`
  *    here to imply a continuous range `[0...n_rows)`.
  * @param[in] orig_index original index

From 8158fa6f550ae951697eb76d5217e9a1f7f130a1 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 17 Apr 2023 11:54:52 +0200
Subject: [PATCH 62/89] Add back specialization headers

Add deprecation notice. If we remove the headers, that would be a
breaking change that we want to avoid for now.
---
 cpp/doxygen/Doxyfile                          |  1 +
 cpp/include/raft/cluster/specializations.cuh  | 22 +++++++++++++++++++
 cpp/include/raft/distance/specializations.cuh | 22 +++++++++++++++++++
 .../distance/specializations/distance.cuh     | 22 +++++++++++++++++++
 .../specializations/fused_l2_nn_min.cuh       | 22 +++++++++++++++++++
 cpp/include/raft/matrix/specializations.cuh   | 22 +++++++++++++++++++
 .../raft/neighbors/specializations.cuh        | 22 +++++++++++++++++++
 .../neighbors/specializations/ball_cover.cuh  | 22 +++++++++++++++++++
 .../neighbors/specializations/brute_force.cuh | 22 +++++++++++++++++++
 .../specializations/fused_l2_knn.cuh          | 22 +++++++++++++++++++
 .../neighbors/specializations/ivf_flat.cuh    | 22 +++++++++++++++++++
 .../raft/neighbors/specializations/ivf_pq.cuh | 22 +++++++++++++++++++
 .../raft/neighbors/specializations/refine.cuh | 22 +++++++++++++++++++
 .../raft/sparse/neighbors/specializations.cuh | 22 +++++++++++++++++++
 .../raft/spatial/knn/specializations.cuh      | 22 +++++++++++++++++++
 .../raft/spatial/knn/specializations/knn.cuh  | 22 +++++++++++++++++++
 cpp/include/raft/spectral/specializations.cuh | 22 +++++++++++++++++++
 cpp/include/raft/stats/specializations.cuh    | 22 +++++++++++++++++++
 18 files changed, 375 insertions(+)
 create mode 100644 cpp/include/raft/cluster/specializations.cuh
 create mode 100644 cpp/include/raft/distance/specializations.cuh
 create mode 100644 cpp/include/raft/distance/specializations/distance.cuh
 create mode 100644 cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh
 create mode 100644 cpp/include/raft/matrix/specializations.cuh
 create mode 100644 cpp/include/raft/neighbors/specializations.cuh
 create mode 100644 cpp/include/raft/neighbors/specializations/ball_cover.cuh
 create mode 100644 cpp/include/raft/neighbors/specializations/brute_force.cuh
 create mode 100644 cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh
 create mode 100644 cpp/include/raft/neighbors/specializations/ivf_flat.cuh
 create mode 100644 cpp/include/raft/neighbors/specializations/ivf_pq.cuh
 create mode 100644 cpp/include/raft/neighbors/specializations/refine.cuh
 create mode 100644 cpp/include/raft/sparse/neighbors/specializations.cuh
 create mode 100644 cpp/include/raft/spatial/knn/specializations.cuh
 create mode 100644 cpp/include/raft/spatial/knn/specializations/knn.cuh
 create mode 100644 cpp/include/raft/spectral/specializations.cuh
 create mode 100644 cpp/include/raft/stats/specializations.cuh

diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
index 17a1e0caca..1948169c91 100644
--- a/cpp/doxygen/Doxyfile
+++ b/cpp/doxygen/Doxyfile
@@ -918,6 +918,7 @@ EXCLUDE_SYMLINKS       = NO
 # Note that the wildcards are matched against the file with absolute path, so to
 # exclude all test directories for example use the pattern */test/*
 
+# TODO: remove specializations from exclude patterns when headers have been removed.
 EXCLUDE_PATTERNS       = */detail/* \
                          */specializations/* \
                          */thirdparty/*
diff --git a/cpp/include/raft/cluster/specializations.cuh b/cpp/include/raft/cluster/specializations.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/cluster/specializations.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/distance/specializations.cuh b/cpp/include/raft/distance/specializations.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/distance/specializations.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/distance/specializations/distance.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh b/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/matrix/specializations.cuh b/cpp/include/raft/matrix/specializations.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/matrix/specializations.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/neighbors/specializations.cuh b/cpp/include/raft/neighbors/specializations.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/neighbors/specializations.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/neighbors/specializations/ball_cover.cuh b/cpp/include/raft/neighbors/specializations/ball_cover.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/neighbors/specializations/ball_cover.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/neighbors/specializations/brute_force.cuh b/cpp/include/raft/neighbors/specializations/brute_force.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/neighbors/specializations/brute_force.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh b/cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/neighbors/specializations/ivf_flat.cuh b/cpp/include/raft/neighbors/specializations/ivf_flat.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/neighbors/specializations/ivf_flat.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/neighbors/specializations/ivf_pq.cuh b/cpp/include/raft/neighbors/specializations/ivf_pq.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/neighbors/specializations/ivf_pq.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/neighbors/specializations/refine.cuh b/cpp/include/raft/neighbors/specializations/refine.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/neighbors/specializations/refine.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/sparse/neighbors/specializations.cuh b/cpp/include/raft/sparse/neighbors/specializations.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/sparse/neighbors/specializations.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/spatial/knn/specializations.cuh b/cpp/include/raft/spatial/knn/specializations.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/specializations.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/spatial/knn/specializations/knn.cuh b/cpp/include/raft/spatial/knn/specializations/knn.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/spatial/knn/specializations/knn.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/spectral/specializations.cuh b/cpp/include/raft/spectral/specializations.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/spectral/specializations.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
diff --git a/cpp/include/raft/stats/specializations.cuh b/cpp/include/raft/stats/specializations.cuh
new file mode 100644
index 0000000000..7ea4aed5c5
--- /dev/null
+++ b/cpp/include/raft/stats/specializations.cuh
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#pragma message(                                          \
+  __FILE__                                                \
+  " is deprecated and will be removed."                   \
+  " Including specializations is not necessary any more." \
+  " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")

From bfde580134bf3fe596759dfea4cd79d97746f4bf Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 17 Apr 2023 13:34:52 +0200
Subject: [PATCH 63/89] Undo change to conda environment

This duplicate entry must have been introduced by a merge conflict somewhere.
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 6bb981c1d0..5d5dc0e378 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -52,7 +52,6 @@ dependencies:
 - sphinx-markdown-tables
 - sysroot_linux-64==2.17
 - ucx-proc=*=gpu
-- ucx-py=0.32.*
 - ucx-py==0.32.*
 - ucx>=1.13.0
 name: all_cuda-118_arch-x86_64

From 858b46d6cb973bac279380f49296742cf285fec2 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 17 Apr 2023 13:48:59 +0200
Subject: [PATCH 64/89] Revert ConfigureCUDA.cmake

These changes were made to make it possible to build RAFT with clang.
This can go into a separate PR.
---
 cpp/cmake/modules/ConfigureCUDA.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake
index 8b2b5c54d0..c733d46985 100644
--- a/cpp/cmake/modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/modules/ConfigureCUDA.cmake
@@ -19,7 +19,6 @@ endif()
 
 if(CMAKE_COMPILER_IS_GNUCXX)
   list(APPEND RAFT_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations)
-  list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations)
 endif()
 
 if(CUDA_LOG_COMPILE_TIME)
@@ -34,8 +33,9 @@ list(APPEND RAFT_CUDA_FLAGS -Xfatbin=-compress-all)
 
 # set warnings as errors
 if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0)
-  # list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings)
+  list(APPEND RAFT_CUDA_FLAGS -Werror=all-warnings)
 endif()
+list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations)
 
 # Option to enable line info in CUDA device compilation to allow introspection when profiling /
 # memchecking

From 651bdd796a69b7eb7f56f14c97567992bd717ec7 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 17 Apr 2023 14:05:38 +0200
Subject: [PATCH 65/89] Rename *-types.cuh to *_types.cuh

To be consistent with other headers
---
 cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh      | 2 +-
 cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh      | 2 +-
 ...lesced_reduction-types.cuh => coalesced_reduction_types.cuh} | 0
 cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh       | 2 +-
 .../raft/spatial/knn/detail/ball_cover/registers-ext.cuh        | 2 +-
 .../raft/spatial/knn/detail/ball_cover/registers-inl.cuh        | 2 +-
 .../ball_cover/{registers-types.cuh => registers_types.cuh}     | 0
 7 files changed, 5 insertions(+), 5 deletions(-)
 rename cpp/include/raft/linalg/detail/{coalesced_reduction-types.cuh => coalesced_reduction_types.cuh} (100%)
 rename cpp/include/raft/spatial/knn/detail/ball_cover/{registers-types.cuh => registers_types.cuh} (100%)

diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh
index 7dbdb59c10..4412c507b2 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction-ext.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "coalesced_reduction-types.cuh"
+#include "coalesced_reduction_types.cuh"
 #include <raft/core/operators.hpp>
 
 // The explicit instantiation of raft::linalg::detail::coalescedReduction is not
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
index 7ba4537b0f..c75e28014c 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "coalesced_reduction-types.cuh"  // policy structs
+#include "coalesced_reduction_types.cuh"  // policy structs
 #include <cub/cub.cuh>
 #include <raft/core/nvtx.hpp>
 #include <raft/core/operators.hpp>
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-types.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction_types.cuh
similarity index 100%
rename from cpp/include/raft/linalg/detail/coalesced_reduction-types.cuh
rename to cpp/include/raft/linalg/detail/coalesced_reduction_types.cuh
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh
index 5522e867fd..ce72b2648f 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh
@@ -17,7 +17,7 @@
 #pragma once
 
 #include "../haversine_distance.cuh"
-#include "registers-types.cuh"
+#include "registers_types.cuh"
 #include <cstdint>
 #include <thrust/functional.h>
 #include <thrust/tuple.h>
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
index b5b54c62a7..2cbf8fb89e 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
@@ -17,7 +17,7 @@
 #pragma once
 
 #include "../../ball_cover_types.hpp"
-#include "registers-types.cuh"          // DistFunc
+#include "registers_types.cuh"          // DistFunc
 #include <cstdint>                      // uint32_t
 #include <raft/util/raft_explicit.hpp>  //RAFT_EXPLICIT
 
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-inl.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-inl.cuh
index 9c624dcb08..e0e7d716ee 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-inl.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-inl.cuh
@@ -20,7 +20,7 @@
 
 #include "../../ball_cover_types.hpp"
 #include "../haversine_distance.cuh"
-#include "registers-types.cuh"  // DistFunc
+#include "registers_types.cuh"  // DistFunc
 
 #include <cstdint>
 #include <limits.h>
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-types.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers_types.cuh
similarity index 100%
rename from cpp/include/raft/spatial/knn/detail/ball_cover/registers-types.cuh
rename to cpp/include/raft/spatial/knn/detail/ball_cover/registers_types.cuh

From a90400d016b39fe819ae649194165b571561cd02 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 17 Apr 2023 14:58:24 +0200
Subject: [PATCH 66/89] Fix doxygen errors

---
 cpp/include/raft/core/logger-inl.hpp | 81 +---------------------------
 1 file changed, 1 insertion(+), 80 deletions(-)

diff --git a/cpp/include/raft/core/logger-inl.hpp b/cpp/include/raft/core/logger-inl.hpp
index ee007eb2b8..85ea4baea5 100644
--- a/cpp/include/raft/core/logger-inl.hpp
+++ b/cpp/include/raft/core/logger-inl.hpp
@@ -44,18 +44,6 @@ namespace raft {
 
 namespace detail {
 
-/**
- * @defgroup CStringFormat Expand a C-style format string
- *
- * @brief Expands C-style formatted string into std::string
- *
- * @param[in] fmt format string
- * @param[in] vl  respective values for each of format modifiers in the string
- *
- * @return the expanded `std::string`
- *
- * @{
- */
 inline std::string format(const char* fmt, va_list& vl)
 {
   va_list vl_copy;
@@ -75,7 +63,6 @@ inline std::string format(const char* fmt, ...)
   va_end(vl);
   return str;
 }
-/** @} */
 
 inline int convert_level_to_spdlog(int level)
 {
@@ -102,85 +89,37 @@ class logger::impl {  // defined privately here
   }
 };  // class logger::impl
 
-/**
- * @brief The main Logging class for raft library.
- *
- * This class acts as a thin wrapper over the underlying `spdlog` interface. The
- * design is done in this way in order to avoid us having to also ship `spdlog`
- * header files in our installation.
- *
- * @todo This currently only supports logging to stdout. Need to add support in
- *       future to add custom loggers as well [Issue #2046]
- */
 RAFT_INLINE_CONDITIONAL logger::logger(std::string const& name_) : pimpl(new impl(name_))
 {
   set_pattern(default_log_pattern);
   set_level(RAFT_ACTIVE_LEVEL);
 }
-/**
- * @brief Singleton method to get the underlying logger object
- *
- * @return the singleton logger object
- */
+
 RAFT_INLINE_CONDITIONAL logger& logger::get(std::string const& name)
 {
   if (log_map.find(name) == log_map.end()) { log_map[name] = std::make_shared<raft::logger>(name); }
   return *log_map[name];
 }
 
-/**
- * @brief Set the logging level.
- *
- * Only messages with level equal or above this will be printed
- *
- * @param[in] level logging level
- *
- * @note The log level will actually be set only if the input is within the
- *       range [RAFT_LEVEL_TRACE, RAFT_LEVEL_OFF]. If it is not, then it'll
- *       be ignored. See documentation of decisiontree for how this gets used
- */
 RAFT_INLINE_CONDITIONAL void logger::set_level(int level)
 {
   level = raft::detail::convert_level_to_spdlog(level);
   pimpl->spdlogger->set_level(static_cast<spdlog::level::level_enum>(level));
 }
 
-/**
- * @brief Set the logging pattern
- *
- * @param[in] pattern the pattern to be set. Refer this link
- *                    https://github.com/gabime/spdlog/wiki/3.-Custom-formatting
- *                    to know the right syntax of this pattern
- */
 RAFT_INLINE_CONDITIONAL void logger::set_pattern(const std::string& pattern)
 {
   pimpl->cur_pattern = pattern;
   pimpl->spdlogger->set_pattern(pattern);
 }
 
-/**
- * @brief Register a callback function to be run in place of usual log call
- *
- * @param[in] callback the function to be run on all logged messages
- */
 RAFT_INLINE_CONDITIONAL void logger::set_callback(void (*callback)(int lvl, const char* msg))
 {
   pimpl->sink->set_callback(callback);
 }
 
-/**
- * @brief Register a flush function compatible with the registered callback
- *
- * @param[in] flush the function to use when flushing logs
- */
 RAFT_INLINE_CONDITIONAL void logger::set_flush(void (*flush)()) { pimpl->sink->set_flush(flush); }
 
-/**
- * @brief Tells whether messages will be logged for the given log level
- *
- * @param[in] level log level to be checked for
- * @return true if messages will be logged for this level, else false
- */
 RAFT_INLINE_CONDITIONAL bool logger::should_log_for(int level) const
 {
   level        = raft::detail::convert_level_to_spdlog(level);
@@ -188,29 +127,14 @@ RAFT_INLINE_CONDITIONAL bool logger::should_log_for(int level) const
   return pimpl->spdlogger->should_log(level_e);
 }
 
-/**
- * @brief Query for the current log level
- *
- * @return the current log level
- */
 RAFT_INLINE_CONDITIONAL int logger::get_level() const
 {
   auto level_e = pimpl->spdlogger->level();
   return RAFT_LEVEL_TRACE - static_cast<int>(level_e);
 }
 
-/**
- * @brief Get the current logging pattern
- * @return the pattern
- */
 RAFT_INLINE_CONDITIONAL std::string logger::get_pattern() const { return pimpl->cur_pattern; }
 
-/**
- * @brief Main logging method
- *
- * @param[in] level logging level of this message
- * @param[in] fmt   C-like format string, followed by respective params
- */
 RAFT_INLINE_CONDITIONAL void logger::log(int level, const char* fmt, ...)
 {
   level        = raft::detail::convert_level_to_spdlog(level);
@@ -225,9 +149,6 @@ RAFT_INLINE_CONDITIONAL void logger::log(int level, const char* fmt, ...)
   }
 }
 
-/**
- * @brief Flush logs by calling flush on underlying logger
- */
 RAFT_INLINE_CONDITIONAL void logger::flush() { pimpl->spdlogger->flush(); }
 
 RAFT_INLINE_CONDITIONAL logger::~logger() {}

From 076a1452b41a7b5f3ba2e9a988b21ccff8739a11 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 17 Apr 2023 15:01:03 +0200
Subject: [PATCH 67/89] Revert "Undo change to conda environment"

This reverts commit c5057d3c17ae1af58e8cbbc31d9aa4536d31f6b3.

For some reason, we need the line "- ucx-py=0.32.*" to get through style
checking.
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 5d5dc0e378..6bb981c1d0 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -52,6 +52,7 @@ dependencies:
 - sphinx-markdown-tables
 - sysroot_linux-64==2.17
 - ucx-proc=*=gpu
+- ucx-py=0.32.*
 - ucx-py==0.32.*
 - ucx>=1.13.0
 name: all_cuda-118_arch-x86_64

From dffb55214f09fd2da156050b85262b0b2d3b9cbc Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Mon, 17 Apr 2023 16:26:32 +0200
Subject: [PATCH 68/89] Undo changes to python dependencies

---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 1 -
 dependencies.yaml                                | 3 +--
 python/pylibraft/pyproject.toml                  | 2 ++
 python/raft-dask/pyproject.toml                  | 2 ++
 4 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 6bb981c1d0..5d5dc0e378 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -52,7 +52,6 @@ dependencies:
 - sphinx-markdown-tables
 - sysroot_linux-64==2.17
 - ucx-proc=*=gpu
-- ucx-py=0.32.*
 - ucx-py==0.32.*
 - ucx>=1.13.0
 name: all_cuda-118_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index 66b1c31b2b..3cbab2fa8d 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -276,14 +276,13 @@ dependencies:
         packages:
           - dask-core==2023.3.2
           - ucx>=1.13.0
-          - ucx-py=0.32.*
           - ucx-proc=*=gpu
       - output_types: pyproject
         packages:
           - pylibraft==23.6.*
   test_python_common:
     common:
-      - output_types: [conda, requirements]
+      - output_types: [conda, requirements, pyproject]
         packages:
           - pytest
           - pytest-cov
diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index cf8e722b86..b4eb296089 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -48,6 +48,8 @@ classifiers = [
 [project.optional-dependencies]
 test = [
     "cupy",
+    "pytest",
+    "pytest-cov",
     "scikit-learn",
     "scipy",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 210a8ffaa8..4901df6c38 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -54,6 +54,8 @@ classifiers = [
 
 [project.optional-dependencies]
 test = [
+    "pytest",
+    "pytest-cov",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]

From 070a3851367d0f351b4a6b7c39509cd37d50e8d3 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <allard@allardhendriksen.nl>
Date: Tue, 18 Apr 2023 10:23:42 +0200
Subject: [PATCH 69/89] Update cpp/include/raft/neighbors/ivf_flat-inl.cuh

Co-authored-by: Tamas Bela Feher <tfeher@nvidia.com>
---
 cpp/include/raft/neighbors/ivf_flat-inl.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/include/raft/neighbors/ivf_flat-inl.cuh b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
index 88229b39d3..adb1762fc3 100644
--- a/cpp/include/raft/neighbors/ivf_flat-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
@@ -181,7 +181,8 @@ void build(raft::device_resources const& handle,
  *   index_params.add_data_on_build = false;      // don't populate index on build
  *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
  *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_flat::build(handle, index_params, dataset, N, D);
+ *   std::optional<raft::device_vector_view<const idx_t, idx_t>> no_op = std::nullopt;
+ *   auto index = ivf_flat::build(handle, index_params, dataset);
  *   // fill the index with the data
  *   auto index = ivf_flat::extend(handle, index_empty, dataset, nullptr, N);
  * @endcode

From 35ed4dfe54836bfd9f721c45df89022c7f414bbe Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <allard@allardhendriksen.nl>
Date: Tue, 18 Apr 2023 10:23:53 +0200
Subject: [PATCH 70/89] Update cpp/include/raft/neighbors/ivf_flat-inl.cuh

Co-authored-by: Tamas Bela Feher <tfeher@nvidia.com>
---
 cpp/include/raft/neighbors/ivf_flat-inl.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/neighbors/ivf_flat-inl.cuh b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
index adb1762fc3..66c6a3cd37 100644
--- a/cpp/include/raft/neighbors/ivf_flat-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
@@ -230,7 +230,7 @@ auto extend(raft::device_resources const& handle,
  *   index_params.add_data_on_build = false;      // don't populate index on build
  *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
  *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_flat::build(handle, dataset, index_params, dataset);
+ *   auto index_empty = ivf_flat::build(handle, index_params, dataset);
  *   // fill the index with the data
  *   auto index = ivf_flat::extend(handle, index_empty, dataset);
  * @endcode

From 65c1cba27df6b92750605f94cacf43c26abf40a3 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <allard@allardhendriksen.nl>
Date: Tue, 18 Apr 2023 10:24:16 +0200
Subject: [PATCH 71/89] Update cpp/include/raft/neighbors/ivf_flat-ext.cuh

Co-authored-by: Tamas Bela Feher <tfeher@nvidia.com>
---
 cpp/include/raft/neighbors/ivf_flat-ext.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/neighbors/ivf_flat-ext.cuh b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
index 9dcc0575d7..ad41534510 100644
--- a/cpp/include/raft/neighbors/ivf_flat-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
@@ -284,7 +284,7 @@ void extend(raft::device_resources const& handle,
  *   index_params.add_data_on_build = false;      // don't populate index on build
  *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
  *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_flat::build(handle, dataset, index_params, dataset);
+ *   auto index_empty = ivf_flat::build(handle, index_params, dataset);
  *   // fill the index with the data
  *   std::optional<raft::device_vector_view<const IdxT, IdxT>> no_op = std::nullopt;
  *   ivf_flat::extend(handle, dataset, no_opt, &index_empty);

From 3ea52b8f838894ad8993c05d30dfb6a83d13fb31 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 18 Apr 2023 10:42:12 +0200
Subject: [PATCH 72/89] Address review re: distance API

---
 .../detail/pairwise_matrix/dispatch-ext.cuh   |  8 ++
 cpp/include/raft/distance/distance-ext.cuh    | 12 +++
 .../detail/pairwise_matrix/dispatch.cu        | 91 -------------------
 cpp/src/distance/distance.cu                  |  9 ++
 4 files changed, 29 insertions(+), 91 deletions(-)
 delete mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch.cu

diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
index 4fc55c29b8..ff95bb56cd 100644
--- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
@@ -65,6 +65,14 @@ void pairwise_matrix_dispatch(OpT distance_op,
       cudaStream_t stream,                                                             \
       bool is_row_major)
 
+/*
+ * Hierarchy of instantiations:
+ *
+ * This file defines extern template instantiations of the distance kernels. The
+ * instantiation of the public API is handled in raft/distance/distance-ext.cuh.
+ *
+ * After adding an instance here, make sure to also add the instance there.
+ */
 instantiate_raft_distance_detail_pairwise_matrix_dispatch(
   raft::distance::detail::ops::canberra_distance_op, float, float, float, raft::identity_op, int);
 instantiate_raft_distance_detail_pairwise_matrix_dispatch(
diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh
index 4df42ca72e..31d4dc28a1 100644
--- a/cpp/include/raft/distance/distance-ext.cuh
+++ b/cpp/include/raft/distance/distance-ext.cuh
@@ -331,6 +331,18 @@ void pairwise_distance(raft::resources const& handle,
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE
 
+/*
+ * Hierarchy of instantiations:
+ *
+ * This file defines the extern template instantiations for the public API of
+ * raft::distance. To improve compile times, the extern template instantiation
+ * of the distance kernels is handled in
+ * distance/detail/pairwise_matrix/dispatch-ext.cuh.
+ *
+ * After adding an instance here, make sure to also add the instance to
+ * dispatch-ext.cuh and the corresponding .cu files.
+ */
+
 #define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, FinalLambda, IdxT)       \
   extern template void raft::distance::distance<DT, DataT, AccT, OutT, FinalLambda, IdxT>( \
     raft::resources const& handle,                                                         \
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch.cu
deleted file mode 100644
index 7b91b3c3bf..0000000000
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch.cu
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/core/operators.hpp>
-#include <raft/distance/detail/distance_ops/all_ops.cuh>  // ops::*
-#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>
-
-#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
-  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
-  template void raft::distance::detail::                                               \
-    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
-      OpT<DataT, AccT, IdxT> distance_op,                                              \
-      IdxT m,                                                                          \
-      IdxT n,                                                                          \
-      IdxT k,                                                                          \
-      const DataT* x,                                                                  \
-      const DataT* y,                                                                  \
-      const DataT* x_norm,                                                             \
-      const DataT* y_norm,                                                             \
-      OutT* out,                                                                       \
-      FinOpT fin_op,                                                                   \
-      cudaStream_t stream,                                                             \
-      bool is_row_major)
-
-instantiate_raft_distance_detail_pairwise_matrix_dispatch(
-  raft::distance::detail::ops::canberra_distance_op, float, float, float, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::canberra_distance_op,
-// double, double, double, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::correlation_distance_op,
-// float, float, float, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::correlation_distance_op,
-// double, double, double, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::cosine_distance_op,
-// float, float, float, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::cosine_distance_op,
-// double, double, double, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::hamming_distance_op,
-// float, float, float, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::hamming_distance_op,
-// double, double, double, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::hellinger_distance_op,
-// float, float, float, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::hellinger_distance_op,
-// double, double, double, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::jensen_shannon_distance_op,
-// float, float, float, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::jensen_shannon_distance_op,
-// double, double, double, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::kl_divergence_op,
-// float, float, float, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::kl_divergence_op,
-// double, double, double, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l1_distance_op,
-// float, float, float, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l1_distance_op,
-// double, double, double, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l2_exp_distance_op,
-// float, float, float, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l2_exp_distance_op,
-// double, double, double, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l2_unexp_distance_op,
-// float, float, float, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l2_unexp_distance_op,
-// double, double, double, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l_inf_distance_op,
-// float, float, float, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::l_inf_distance_op,
-// double, double, double, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::lp_unexp_distance_op,
-// float, float, float, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::lp_unexp_distance_op,
-// double, double, double, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::russel_rao_distance_op,
-// float, float, float, raft::identity_op, int);
-// instantiate_raft_distance_detail_pairwise_matrix_dispatch(raft::distance::detail::ops::russel_rao_distance_op,
-// double, double, double, raft::identity_op, int);
-
-#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu
index f986dd30ef..91bd506724 100644
--- a/cpp/src/distance/distance.cu
+++ b/cpp/src/distance/distance.cu
@@ -16,6 +16,15 @@
 
 #include <raft/distance/distance-inl.cuh>
 
+/*
+ * Hierarchy of instantiations:
+ *
+ * This file defines the template instantiations for the public API of
+ * raft::distance. To improve compile times, the compilation of the distance
+ * kernels is handled in distance/detail/pairwise_matrix/dispatch_*.cu.
+ *
+ */
+
 #define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, FinalLambda, IdxT) \
   template void raft::distance::distance<DT, DataT, AccT, OutT, FinalLambda, IdxT>(  \
     raft::resources const& handle,                                                   \

From f8788efbf1dd29a5cbc43efc05247d8ff3c48a29 Mon Sep 17 00:00:00 2001
From: Jordan Jacobelli <jjacobelli@nvidia.com>
Date: Mon, 17 Apr 2023 19:22:12 +0200
Subject: [PATCH 73/89] Use ARC V2 self-hosted runners for GPU jobs (#1410)

This PR updates the runner labels to use ARC V2 self-hosted runners for GPU jobs. This is needed to resolve the auto-scaling issues.

Authors:
  - Jordan Jacobelli (https://github.com/jjacobelli)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)

URL: https://github.com/rapidsai/raft/pull/1410
---
 .github/workflows/build.yaml | 2 +-
 .github/workflows/pr.yaml    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index bec89ab888..0f5f84c158 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -60,7 +60,7 @@ jobs:
     uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06
     with:
       build_type: branch
-      node_type: "gpu-latest-1"
+      node_type: "gpu-v100-latest-1"
       arch: "amd64"
       container_image: "rapidsai/ci:latest"
       run_script: "ci/build_docs.sh"
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 0ce32b73b6..82435a103f 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -60,7 +60,7 @@ jobs:
     uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.06
     with:
       build_type: pull-request
-      node_type: "gpu-latest-1"
+      node_type: "gpu-v100-latest-1"
       arch: "amd64"
       container_image: "rapidsai/ci:latest"
       run_script: "ci/build_docs.sh"

From 5c179e6d33d37a01f89b886c6ec3bfda62821f0e Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Mon, 17 Apr 2023 11:34:47 -0700
Subject: [PATCH 74/89] Fix is_min_close (#1419)

Correlation and Cosine distance both return (1 - similarity) in the pairwise distance APIs, meaning that is_min_close returns the wrong sort order for them. Fix.

Authors:
  - Ben Frederickson (https://github.com/benfred)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/1419
---
 cpp/include/raft/distance/distance_types.hpp     | 2 --
 cpp/include/raft/sparse/neighbors/detail/knn.cuh | 5 ++---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/cpp/include/raft/distance/distance_types.hpp b/cpp/include/raft/distance/distance_types.hpp
index 4060147f1d..d17ef358ee 100644
--- a/cpp/include/raft/distance/distance_types.hpp
+++ b/cpp/include/raft/distance/distance_types.hpp
@@ -74,8 +74,6 @@ inline bool is_min_close(DistanceType metric)
   bool select_min;
   switch (metric) {
     case DistanceType::InnerProduct:
-    case DistanceType::CosineExpanded:
-    case DistanceType::CorrelationExpanded:
       // Similarity metrics have the opposite meaning, i.e. nearest neighbors are those with larger
       // similarity (See the same logic at cpp/include/raft/sparse/spatial/detail/knn.cuh:362
       // {perform_k_selection})
diff --git a/cpp/include/raft/sparse/neighbors/detail/knn.cuh b/cpp/include/raft/sparse/neighbors/detail/knn.cuh
index 7bedec9830..f9f07c13ca 100644
--- a/cpp/include/raft/sparse/neighbors/detail/knn.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/knn.cuh
@@ -355,8 +355,7 @@ class sparse_knn_t {
     // want to adjust k.
     value_idx n_neighbors = std::min(static_cast<value_idx>(k), batch_cols);
 
-    bool ascending = true;
-    if (metric == raft::distance::DistanceType::InnerProduct) ascending = false;
+    bool ascending = raft::distance::is_min_close(metric);
 
     // kernel to slice first (min) k cols and copy into batched merge buffer
     raft::spatial::knn::select_k(batch_dists,
@@ -425,4 +424,4 @@ class sparse_knn_t {
   raft::device_resources const& handle;
 };
 
-};  // namespace raft::sparse::neighbors::detail
\ No newline at end of file
+};  // namespace raft::sparse::neighbors::detail

From edb1c5c98aeda4b461c62fdd0207c3fc4fbdfff2 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Mon, 17 Apr 2023 22:06:33 +0200
Subject: [PATCH 75/89] IVF-PQ: manipulating individual lists (#1298)

Add public functions for reading and writing into individual ivf-pq lists (clusters), in the input space (reconstructed data) and in flat PQ codes.

Partially solves (IVF-PQ) https://github.com/rapidsai/raft/issues/1205

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)
  - Corey J. Nolet (https://github.com/cjnolet)
  - Tamas Bela Feher (https://github.com/tfeher)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/1298
---
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 731 ++++++++++++++----
 .../neighbors/detail/ivf_pq_codepacking.cuh   | 214 +++++
 cpp/include/raft/neighbors/ivf_pq_helpers.cuh | 409 ++++++++++
 cpp/test/neighbors/ann_ivf_pq.cuh             | 199 ++++-
 4 files changed, 1389 insertions(+), 164 deletions(-)
 create mode 100644 cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh
 create mode 100644 cpp/include/raft/neighbors/ivf_pq_helpers.cuh

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index 1a563d213e..36ceccc36f 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -18,6 +18,7 @@
 
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 
+#include <raft/neighbors/detail/ivf_pq_codepacking.cuh>
 #include <raft/neighbors/ivf_list.hpp>
 #include <raft/neighbors/ivf_pq_types.hpp>
 
@@ -60,63 +61,6 @@ namespace raft::neighbors::ivf_pq::detail {
 
 using namespace raft::spatial::knn::detail;  // NOLINT
 
-/** A chunk of PQ-encoded vector managed by one CUDA thread. */
-using pq_vec_t = TxN_t<uint8_t, kIndexGroupVecLen>::io_t;
-
-namespace {
-
-/**
- * This type mimics the `uint8_t&` for the indexing operator of `bitfield_view_t`.
- *
- * @tparam Bits number of bits comprising the value.
- */
-template <uint32_t Bits>
-struct bitfield_ref_t {
-  static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte");
-  constexpr static uint8_t kMask = static_cast<uint8_t>((1u << Bits) - 1u);
-  uint8_t* ptr;
-  uint32_t offset;
-
-  constexpr operator uint8_t()  // NOLINT
-  {
-    auto pair = static_cast<uint16_t>(ptr[0]);
-    if (offset + Bits > 8) { pair |= static_cast<uint16_t>(ptr[1]) << 8; }
-    return static_cast<uint8_t>((pair >> offset) & kMask);
-  }
-
-  constexpr auto operator=(uint8_t code) -> bitfield_ref_t&
-  {
-    if (offset + Bits > 8) {
-      auto pair = static_cast<uint16_t>(ptr[0]);
-      pair |= static_cast<uint16_t>(ptr[1]) << 8;
-      pair &= ~(static_cast<uint16_t>(kMask) << offset);
-      pair |= static_cast<uint16_t>(code) << offset;
-      ptr[0] = static_cast<uint8_t>(Pow2<256>::mod(pair));
-      ptr[1] = static_cast<uint8_t>(Pow2<256>::div(pair));
-    } else {
-      ptr[0] = (ptr[0] & ~(kMask << offset)) | (code << offset);
-    }
-    return *this;
-  }
-};
-
-/**
- * View a byte array as an array of unsigned integers of custom small bit size.
- *
- * @tparam Bits number of bits comprising a single element of the array.
- */
-template <uint32_t Bits>
-struct bitfield_view_t {
-  static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte");
-  uint8_t* raw;
-
-  constexpr auto operator[](uint32_t i) -> bitfield_ref_t<Bits>
-  {
-    uint32_t bit_offset = i * Bits;
-    return bitfield_ref_t<Bits>{raw + Pow2<8>::div(bit_offset), Pow2<8>::mod(bit_offset)};
-  }
-};
-
 template <uint32_t BlockDim, typename T, typename S>
 __launch_bounds__(BlockDim) __global__ void copy_warped_kernel(
   T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows)
@@ -162,8 +106,6 @@ void copy_warped(T* out,
     <<<blocks, threads, 0, stream>>>(out, ld_out, in, ld_in, n_cols, n_rows);
 }
 
-}  // namespace
-
 /**
  * @brief Fill-in a random orthogonal transformation matrix.
  *
@@ -276,7 +218,7 @@ void flat_compute_residuals(
   device_matrix_view<const float, uint32_t, row_major> rotation_matrix,  // [rot_dim, dim]
   device_matrix_view<const float, uint32_t, row_major> centers,          // [n_lists, dim_ext]
   const T* dataset,                                                      // [n_rows, dim]
-  const uint32_t* labels,                                                // [n_rows]
+  std::variant<uint32_t, const uint32_t*> labels,                        // [n_rows]
   rmm::mr::device_memory_resource* device_memory)
 {
   auto stream  = handle.get_stream();
@@ -287,7 +229,9 @@ void flat_compute_residuals(
   linalg::map_offset(handle, tmp_view, [centers, dataset, labels, dim] __device__(size_t i) {
     auto row_ix = i / dim;
     auto el_ix  = i % dim;
-    auto label  = labels[row_ix];
+    auto label  = std::holds_alternative<uint32_t>(labels)
+                    ? std::get<uint32_t>(labels)
+                    : std::get<const uint32_t*>(labels)[row_ix];
     return utils::mapping<float>{}(dataset[i]) - centers(label, el_ix);
   });
 
@@ -558,11 +502,363 @@ void train_per_cluster(raft::device_resources const& handle,
 }
 
 /**
- * Compute the code: find the closest cluster in each pq_dim-subspace.
+ * A helper function: given the dataset in the rotated space
+ *  [n_rows, rot_dim] = [n_rows, pq_dim * pq_len],
+ * reinterpret the last dimension as two: [n_rows, pq_dim, pq_len]
+ *
+ * @tparam T
+ * @tparam IdxT
+ *
+ * @param vectors input data [n_rows, rot_dim]
+ * @param pq_centers codebook (used to infer the structure - pq_len)
+ * @return reinterpreted vectors [n_rows, pq_dim, pq_len]
+ */
+template <typename T, typename IdxT>
+static __device__ auto reinterpret_vectors(
+  device_matrix_view<T, IdxT, row_major> vectors,
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers)
+  -> device_mdspan<T, extent_3d<IdxT>, row_major>
+{
+  const uint32_t pq_len = pq_centers.extent(1);
+  const uint32_t pq_dim = vectors.extent(1) / pq_len;
+  using layout_t        = typename decltype(vectors)::layout_type;
+  using accessor_t      = typename decltype(vectors)::accessor_type;
+  return mdspan<T, extent_3d<IdxT>, layout_t, accessor_t>(
+    vectors.data_handle(), extent_3d<IdxT>{vectors.extent(0), pq_dim, pq_len});
+}
+
+/**
+ * A consumer for the `run_on_list` and `run_on_vector` that just flattens PQ codes
+ * one-per-byte. That is, independent of the code width (pq_bits), one code uses
+ * the whole byte, hence one vector uses pq_dim bytes.
+ */
+struct unpack_codes {
+  device_matrix_view<uint8_t, uint32_t, row_major> out_codes;
+
+  /**
+   * Create a callable to be passed to `run_on_list`.
+   *
+   * @param[out] out_codes the destination for the read codes.
+   */
+  __device__ inline unpack_codes(device_matrix_view<uint8_t, uint32_t, row_major> out_codes)
+    : out_codes{out_codes}
+  {
+  }
+
+  /**  Write j-th component (code) of the i-th vector into the output array. */
+  __device__ inline void operator()(uint8_t code, uint32_t i, uint32_t j)
+  {
+    out_codes(i, j) = code;
+  }
+};
+
+template <uint32_t BlockSize, uint32_t PqBits>
+__launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel(
+  device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  const uint32_t pq_dim = out_codes.extent(1);
+  auto unpack_action    = unpack_codes{out_codes};
+  run_on_list<PqBits>(in_list_data, offset_or_indices, out_codes.extent(0), pq_dim, unpack_action);
+}
+
+/**
+ * Unpack flat PQ codes from an existing list by the given offset.
+ *
+ * @param[out] codes flat PQ codes, one code per byte [n_rows, pq_dim]
+ * @param[in] list_data the packed ivf::list data.
+ * @param[in] offset_or_indices how many records in the list to skip or the exact indices.
+ * @param[in] pq_bits codebook size (1 << pq_bits)
+ * @param[in] stream
+ */
+inline void unpack_list_data(
+  device_matrix_view<uint8_t, uint32_t, row_major> codes,
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t pq_bits,
+  rmm::cuda_stream_view stream)
+{
+  auto n_rows = codes.extent(0);
+  if (n_rows == 0) { return; }
+
+  constexpr uint32_t kBlockSize = 256;
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = [pq_bits]() {
+    switch (pq_bits) {
+      case 4: return unpack_list_data_kernel<kBlockSize, 4>;
+      case 5: return unpack_list_data_kernel<kBlockSize, 5>;
+      case 6: return unpack_list_data_kernel<kBlockSize, 6>;
+      case 7: return unpack_list_data_kernel<kBlockSize, 7>;
+      case 8: return unpack_list_data_kernel<kBlockSize, 8>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }();
+  kernel<<<blocks, threads, 0, stream>>>(codes, list_data, offset_or_indices);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/** Unpack the list data; see the public interface for the api and usage. */
+template <typename IdxT>
+void unpack_list_data(raft::device_resources const& res,
+                      const index<IdxT>& index,
+                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+                      uint32_t label,
+                      std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  unpack_list_data(out_codes,
+                   index.lists()[label]->data.view(),
+                   offset_or_indices,
+                   index.pq_bits(),
+                   res.get_stream());
+}
+
+/** A consumer for the `run_on_list` and `run_on_vector` that approximates the original input data.
+ */
+struct reconstruct_vectors {
+  codebook_gen codebook_kind;
+  uint32_t cluster_ix;
+  uint32_t pq_len;
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers;
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> centers_rot;
+  device_mdspan<float, extent_3d<uint32_t>, row_major> out_vectors;
+
+  /**
+   * Create a callable to be passed to `run_on_list`.
+   *
+   * @param[out] out_vectors the destination for the decoded vectors.
+   * @param[in] pq_centers the codebook
+   * @param[in] centers_rot
+   * @param[in] codebook_kind
+   * @param[in] cluster_ix label/id of the cluster.
+   */
+  __device__ inline reconstruct_vectors(
+    device_matrix_view<float, uint32_t, row_major> out_vectors,
+    device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+    device_matrix_view<const float, uint32_t, row_major> centers_rot,
+    codebook_gen codebook_kind,
+    uint32_t cluster_ix)
+    : codebook_kind{codebook_kind},
+      cluster_ix{cluster_ix},
+      pq_len{pq_centers.extent(1)},
+      pq_centers{pq_centers},
+      centers_rot{reinterpret_vectors(centers_rot, pq_centers)},
+      out_vectors{reinterpret_vectors(out_vectors, pq_centers)}
+  {
+  }
+
+  /**
+   * Decode j-th component of the i-th vector by its code and write it into a chunk of the output
+   * vectors (pq_len elements).
+   */
+  __device__ inline void operator()(uint8_t code, uint32_t i, uint32_t j)
+  {
+    uint32_t partition_ix;
+    switch (codebook_kind) {
+      case codebook_gen::PER_CLUSTER: {
+        partition_ix = cluster_ix;
+      } break;
+      case codebook_gen::PER_SUBSPACE: {
+        partition_ix = j;
+      } break;
+      default: __builtin_unreachable();
+    }
+    for (uint32_t k = 0; k < pq_len; k++) {
+      out_vectors(i, j, k) = pq_centers(partition_ix, k, code) + centers_rot(cluster_ix, j, k);
+    }
+  }
+};
+
+template <uint32_t BlockSize, uint32_t PqBits>
+__launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel(
+  device_matrix_view<float, uint32_t, row_major> out_vectors,
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+  device_matrix_view<const float, uint32_t, row_major> centers_rot,
+  codebook_gen codebook_kind,
+  uint32_t cluster_ix,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  const uint32_t pq_dim = out_vectors.extent(1) / pq_centers.extent(1);
+  auto reconstruct_action =
+    reconstruct_vectors{out_vectors, pq_centers, centers_rot, codebook_kind, cluster_ix};
+  run_on_list<PqBits>(
+    in_list_data, offset_or_indices, out_vectors.extent(0), pq_dim, reconstruct_action);
+}
+
+/** Decode the list data; see the public interface for the api and usage. */
+template <typename T, typename IdxT>
+void reconstruct_list_data(raft::device_resources const& res,
+                           const index<IdxT>& index,
+                           device_matrix_view<T, uint32_t, row_major> out_vectors,
+                           uint32_t label,
+                           std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  auto n_rows = out_vectors.extent(0);
+  if (n_rows == 0) { return; }
+  auto& list = index.lists()[label];
+  if (std::holds_alternative<uint32_t>(offset_or_indices)) {
+    auto n_skip = std::get<uint32_t>(offset_or_indices);
+    // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)`
+    // to avoid an extra device-host data copy and the stream sync.
+    RAFT_EXPECTS(n_skip + n_rows <= list->size.load(),
+                 "offset + output size must be not bigger than the cluster size.");
+  }
+
+  auto tmp = make_device_mdarray<float>(
+    res, res.get_workspace_resource(), make_extents<uint32_t>(n_rows, index.rot_dim()));
+
+  constexpr uint32_t kBlockSize = 256;
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = [](uint32_t pq_bits) {
+    switch (pq_bits) {
+      case 4: return reconstruct_list_data_kernel<kBlockSize, 4>;
+      case 5: return reconstruct_list_data_kernel<kBlockSize, 5>;
+      case 6: return reconstruct_list_data_kernel<kBlockSize, 6>;
+      case 7: return reconstruct_list_data_kernel<kBlockSize, 7>;
+      case 8: return reconstruct_list_data_kernel<kBlockSize, 8>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }(index.pq_bits());
+  kernel<<<blocks, threads, 0, res.get_stream()>>>(tmp.view(),
+                                                   list->data.view(),
+                                                   index.pq_centers(),
+                                                   index.centers_rot(),
+                                                   index.codebook_kind(),
+                                                   label,
+                                                   offset_or_indices);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+  float* out_float_ptr = nullptr;
+  rmm::device_uvector<float> out_float_buf(0, res.get_stream(), res.get_workspace_resource());
+  if constexpr (std::is_same_v<T, float>) {
+    out_float_ptr = out_vectors.data_handle();
+  } else {
+    out_float_buf.resize(size_t{n_rows} * size_t{index.dim()}, res.get_stream());
+    out_float_ptr = out_float_buf.data();
+  }
+  // Rotate the results back to the original space
+  float alpha = 1.0;
+  float beta  = 0.0;
+  linalg::gemm(res,
+               false,
+               false,
+               index.dim(),
+               n_rows,
+               index.rot_dim(),
+               &alpha,
+               index.rotation_matrix().data_handle(),
+               index.dim(),
+               tmp.data_handle(),
+               index.rot_dim(),
+               &beta,
+               out_float_ptr,
+               index.dim(),
+               res.get_stream());
+  // Transform the data to the original type, if necessary
+  if constexpr (!std::is_same_v<T, float>) {
+    linalg::map(res,
+                out_vectors,
+                utils::mapping<T>{},
+                make_device_matrix_view<const float>(out_float_ptr, n_rows, index.dim()));
+  }
+}
+
+/**
+ * A producer for `write_list` and `write_vector` that reads the source codes byte-by-byte. That
+ * is, independent of the code width (pq_bits), one code uses a whole byte, hence one vector uses
+ * pq_dim bytes.
+ */
+struct pass_codes {
+  device_matrix_view<const uint8_t, uint32_t, row_major> codes;
+
+  /**
+   * Create a callable to be passed to `write_list` (see `pack_list_data_kernel`).
+   *
+   * @param[in] codes the source codes, one code per byte, indexed as (vector, component).
+   */
+  __device__ inline pass_codes(device_matrix_view<const uint8_t, uint32_t, row_major> codes)
+    : codes{codes}
+  {
+  }
+
+  /** Read the j-th component (code) of the i-th vector from the source. */
+  __device__ inline auto operator()(uint32_t i, uint32_t j) const -> uint8_t { return codes(i, j); }
+};
+
+template <uint32_t BlockSize, uint32_t PqBits>
+__launch_bounds__(BlockSize) __global__ void pack_list_data_kernel(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  device_matrix_view<const uint8_t, uint32_t, row_major> codes,  // one code per byte
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  write_list<PqBits, 1>(  // SubWarpSize = 1: every thread packs its vectors on its own
+    list_data, offset_or_indices, codes.extent(0), codes.extent(1), pass_codes{codes});
+}
+
+/**
+ * Write flat PQ codes into an existing list at the given offset or at the given indices.
+ *
+ * NB: no memory allocation happens here; the list must fit the data (offset + n_rows).
+ *
+ * @param[out] list_data the packed ivf::list data.
+ * @param[in] codes flat PQ codes, one code per byte [n_rows, pq_dim]
+ * @param[in] offset_or_indices how many records in the list to skip or the exact indices.
+ * @param[in] pq_bits codebook size (1 << pq_bits); the value must be within [4, 8]
+ * @param[in] stream the CUDA stream on which the packing kernel is launched
+ */
+inline void pack_list_data(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  device_matrix_view<const uint8_t, uint32_t, row_major> codes,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t pq_bits,
+  rmm::cuda_stream_view stream)
+{
+  auto n_rows = codes.extent(0);
+  if (n_rows == 0) { return; }  // nothing to pack
+
+  constexpr uint32_t kBlockSize = 256;
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);  // one thread per row
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = [pq_bits]() {  // pick the kernel instantiation for the runtime pq_bits
+    switch (pq_bits) {
+      case 4: return pack_list_data_kernel<kBlockSize, 4>;
+      case 5: return pack_list_data_kernel<kBlockSize, 5>;
+      case 6: return pack_list_data_kernel<kBlockSize, 6>;
+      case 7: return pack_list_data_kernel<kBlockSize, 7>;
+      case 8: return pack_list_data_kernel<kBlockSize, 8>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }();
+  kernel<<<blocks, threads, 0, stream>>>(list_data, codes, offset_or_indices);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+template <typename IdxT>
+void pack_list_data(raft::device_resources const& res,
+                    index<IdxT>* index,
+                    device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
+                    uint32_t label,
+                    std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  pack_list_data(index->lists()[label]->data.view(),  // the data of the list to write into
+                 new_codes,
+                 offset_or_indices,
+                 index->pq_bits(),
+                 res.get_stream());  // the kernel is launched on the stream of `res`
+}
+
+/**
+ *
+ * A producer for the `write_list` and `write_vector` that encodes level-1 input vector residuals
+ * into lvl-2 PQ codes.
+ * Computing a PQ code means finding the closest cluster in a pq_dim-subspace.
  *
  * @tparam SubWarpSize
  *   how many threads work on a single vector;
- *   bouded by either WarpSize or pq_book_size.
+ *   bounded by either WarpSize or pq_book_size.
  *
  * @param pq_centers
  *   - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size]
@@ -574,56 +870,75 @@ void train_per_cluster(raft::device_resources const& handle,
  * @param j index along pq_dim "dimension"
  * @param cluster_ix is used for PER_CLUSTER codebooks.
  */
-template <uint32_t SubWarpSize>
-__device__ auto compute_pq_code(
-  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
-  device_mdspan<const float, extent_2d<uint32_t>, row_major> new_vector,
-  codebook_gen codebook_kind,
-  uint32_t j,
-  uint32_t cluster_ix) -> uint8_t
-{
-  using subwarp_align = Pow2<SubWarpSize>;
-  uint32_t lane_id    = subwarp_align::mod(laneId());
-  uint32_t partition_ix;
-  switch (codebook_kind) {
-    case codebook_gen::PER_CLUSTER: {
-      partition_ix = cluster_ix;
-    } break;
-    case codebook_gen::PER_SUBSPACE: {
-      partition_ix = j;
-    } break;
-    default: __builtin_unreachable();
+/** A callable producer for `write_list`/`write_vector`: `operator()(i, j)` returns the PQ code of
+ *  the j-th component of the i-th input vector (see the description in the comment above). */
+template <uint32_t SubWarpSize, typename IdxT>
+struct encode_vectors {
+  codebook_gen codebook_kind;
+  uint32_t cluster_ix;
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers;
+  device_mdspan<const float, extent_3d<IdxT>, row_major> in_vectors;  // indexed as (i, j, k)
+
+  __device__ inline encode_vectors(
+    device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+    device_matrix_view<const float, IdxT, row_major> in_vectors,
+    codebook_gen codebook_kind,
+    uint32_t cluster_ix)
+    : codebook_kind{codebook_kind},
+      cluster_ix{cluster_ix},
+      pq_centers{pq_centers},
+      in_vectors{reinterpret_vectors(in_vectors, pq_centers)}
+  {
   }
 
-  const uint32_t pq_book_size = pq_centers.extent(2);
-  const uint32_t pq_len       = pq_centers.extent(1);
-  float min_dist              = std::numeric_limits<float>::infinity();
-  uint8_t code                = 0;
-  // calculate the distance for each PQ cluster, find the minimum for each thread
-  for (uint32_t i = lane_id; i < pq_book_size; i += subwarp_align::Value) {
-    // NB: the L2 quantifiers on residuals are always trained on L2 metric.
-    float d = 0.0f;
-    for (uint32_t k = 0; k < pq_len; k++) {
-      auto t = new_vector(j, k) - pq_centers(partition_ix, k, i);
-      d += t * t;
+  /**
+   * Encode the j-th component (pq_len elements) of the i-th vector: find the nearest center in
+   * the selected codebook by the squared L2 distance and return its index as the PQ code.
+   */
+  __device__ inline auto operator()(IdxT i, uint32_t j) -> uint8_t
+  {
+    uint32_t lane_id = Pow2<SubWarpSize>::mod(laneId());
+    uint32_t partition_ix;
+    switch (codebook_kind) {
+      case codebook_gen::PER_CLUSTER: {
+        partition_ix = cluster_ix;
+      } break;
+      case codebook_gen::PER_SUBSPACE: {
+        partition_ix = j;
+      } break;
+      default: __builtin_unreachable();
     }
-    if (d < min_dist) {
-      min_dist = d;
-      code     = uint8_t(i);
+
+    const uint32_t pq_book_size = pq_centers.extent(2);
+    const uint32_t pq_len       = pq_centers.extent(1);
+    float min_dist              = std::numeric_limits<float>::infinity();
+    uint8_t code                = 0;
+    // each thread visits every SubWarpSize-th PQ center and keeps the closest one it has seen
+    for (uint32_t l = lane_id; l < pq_book_size; l += SubWarpSize) {
+      // NB: the L2 quantizers on residuals are always trained on L2 metric.
+      float d = 0.0f;
+      for (uint32_t k = 0; k < pq_len; k++) {
+        auto t = in_vectors(i, j, k) - pq_centers(partition_ix, k, l);
+        d += t * t;
+      }
+      if (d < min_dist) {
+        min_dist = d;
+        code     = uint8_t(l);
+      }
     }
-  }
-  // reduce among threads
+    // reduce among the threads of the sub-warp (keep the smaller distance and its code)
 #pragma unroll
-  for (uint32_t stride = SubWarpSize >> 1; stride > 0; stride >>= 1) {
-    const auto other_dist = shfl_xor(min_dist, stride, SubWarpSize);
-    const auto other_code = shfl_xor(code, stride, SubWarpSize);
-    if (other_dist < min_dist) {
-      min_dist = other_dist;
-      code     = other_code;
+    for (uint32_t stride = SubWarpSize >> 1; stride > 0; stride >>= 1) {
+      const auto other_dist = shfl_xor(min_dist, stride, SubWarpSize);
+      const auto other_code = shfl_xor(code, stride, SubWarpSize);
+      if (other_dist < min_dist) {
+        min_dist = other_dist;
+        code     = other_code;
+      }
     }
+    return code;
   }
-  return code;
-}
+};
 
 template <uint32_t BlockSize, uint32_t PqBits, typename IdxT>
 __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
@@ -639,7 +954,7 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
   constexpr uint32_t kSubWarpSize = std::min<uint32_t>(WarpSize, 1u << PqBits);
   using subwarp_align             = Pow2<kSubWarpSize>;
   const uint32_t lane_id          = subwarp_align::mod(threadIdx.x);
-  const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{blockDim.x} * IdxT{blockIdx.x});
+  const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{BlockSize} * IdxT{blockIdx.x});
   if (row_ix >= new_vectors.extent(0)) { return; }
 
   const uint32_t cluster_ix = new_labels[row_ix];
@@ -647,7 +962,7 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
   if (lane_id == 0) { out_ix = atomicAdd(&list_sizes(cluster_ix), 1); }
   out_ix = shfl(out_ix, 0, kSubWarpSize);
 
-  // write the label
+  // write the label  (one record per subwarp)
   auto pq_indices = inds_ptrs(cluster_ix);
   if (lane_id == 0) {
     if (std::holds_alternative<IdxT>(src_offset_or_indices)) {
@@ -657,40 +972,81 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
     }
   }
 
-  // write the codes
-  using group_align         = Pow2<kIndexGroupSize>;
-  const uint32_t group_ix   = group_align::div(out_ix);
-  const uint32_t ingroup_ix = group_align::mod(out_ix);
-  const uint32_t pq_len     = pq_centers.extent(1);
-  const uint32_t pq_dim     = new_vectors.extent(1) / pq_len;
-
+  // write the codes (one record per subwarp):
+  const uint32_t pq_dim = new_vectors.extent(1) / pq_centers.extent(1);
   auto pq_extents = list_spec<uint32_t, IdxT>{PqBits, pq_dim, true}.make_list_extents(out_ix + 1);
-  auto pq_extents_vectorized =
-    make_extents<uint32_t>(pq_extents.extent(0), pq_extents.extent(1), pq_extents.extent(2));
-  auto pq_dataset = make_mdspan<pq_vec_t, uint32_t, row_major, false, true>(
-    reinterpret_cast<pq_vec_t*>(data_ptrs[cluster_ix]), pq_extents_vectorized);
-
-  __shared__ pq_vec_t codes[subwarp_align::div(BlockSize)];
-  pq_vec_t& code = codes[subwarp_align::div(threadIdx.x)];
-  bitfield_view_t<PqBits> out{reinterpret_cast<uint8_t*>(&code)};
-  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
-  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
-    // clear the chunk for writing
-    if (lane_id == 0) { code = pq_vec_t{}; }
-    // fill-in the values, one/pq_dim at a time
-#pragma unroll
-    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
-      // find the label
-      using layout_t   = typename decltype(new_vectors)::layout_type;
-      using accessor_t = typename decltype(new_vectors)::accessor_type;
-      auto one_vector  = mdspan<const float, extent_2d<uint32_t>, layout_t, accessor_t>(
-        &new_vectors(row_ix, 0), extent_2d<uint32_t>{pq_dim, pq_len});
-      auto l = compute_pq_code<kSubWarpSize>(pq_centers, one_vector, codebook_kind, j, cluster_ix);
-      if (lane_id == 0) { out[k] = l; }
+  auto pq_dataset =
+    make_mdspan<uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);
+  write_vector<PqBits, kSubWarpSize>(
+    pq_dataset,
+    out_ix,
+    row_ix,
+    pq_dim,
+    encode_vectors<kSubWarpSize, IdxT>{pq_centers, new_vectors, codebook_kind, cluster_ix});
+}
+
+template <uint32_t BlockSize, uint32_t PqBits>
+__launch_bounds__(BlockSize) __global__ void encode_list_data_kernel(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  device_matrix_view<const float, uint32_t, row_major> new_vectors,
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+  codebook_gen codebook_kind,
+  uint32_t cluster_ix,  // selects the codebook when codebook_kind is PER_CLUSTER
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  constexpr uint32_t kSubWarpSize = std::min<uint32_t>(WarpSize, 1u << PqBits);
+  const uint32_t pq_dim           = new_vectors.extent(1) / pq_centers.extent(1);
+  auto encode_action =  // computes one PQ code per call, cooperatively by kSubWarpSize threads
+    encode_vectors<kSubWarpSize, uint32_t>{pq_centers, new_vectors, codebook_kind, cluster_ix};
+  write_list<PqBits, kSubWarpSize>(
+    list_data, offset_or_indices, new_vectors.extent(0), pq_dim, encode_action);
+}
+
+template <typename T, typename IdxT>
+void encode_list_data(raft::device_resources const& res,
+                      index<IdxT>* index,
+                      device_matrix_view<const T, uint32_t, row_major> new_vectors,
+                      uint32_t label,
+                      std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  auto n_rows = new_vectors.extent(0);
+  if (n_rows == 0) { return; }  // nothing to encode
+
+  auto mr = res.get_workspace_resource();
+  // A temporary [n_rows, rot_dim] float buffer for the residuals, taken from the workspace.
+  auto new_vectors_residual =
+    make_device_mdarray<float>(res, mr, make_extents<uint32_t>(n_rows, index->rot_dim()));
+  // Fill the buffer (NOTE(review): presumably rotated residuals w.r.t. the center of `label`).
+  flat_compute_residuals<T, uint32_t>(res,
+                                      new_vectors_residual.data_handle(),
+                                      n_rows,
+                                      index->rotation_matrix(),
+                                      index->centers(),
+                                      new_vectors.data_handle(),
+                                      label,
+                                      mr);
+  // Launch configuration: threads_per_vec threads cooperate on every input vector.
+  constexpr uint32_t kBlockSize  = 256;
+  const uint32_t threads_per_vec = std::min<uint32_t>(WarpSize, index->pq_book_size());
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize / threads_per_vec), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = [](uint32_t pq_bits) {
+    switch (pq_bits) {
+      case 4: return encode_list_data_kernel<kBlockSize, 4>;
+      case 5: return encode_list_data_kernel<kBlockSize, 5>;
+      case 6: return encode_list_data_kernel<kBlockSize, 6>;
+      case 7: return encode_list_data_kernel<kBlockSize, 7>;
+      case 8: return encode_list_data_kernel<kBlockSize, 8>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
     }
-    // write the chunk into the dataset
-    if (lane_id == 0) { pq_dataset(group_ix, i, ingroup_ix) = code; }
-  }
+  }(index->pq_bits());
+  kernel<<<blocks, threads, 0, res.get_stream()>>>(index->lists()[label]->data.view(),
+                                                   new_vectors_residual.view(),
+                                                   index->pq_centers(),
+                                                   index->codebook_kind(),
+                                                   label,
+                                                   offset_or_indices);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
 /**
@@ -732,14 +1088,14 @@ void process_and_fill_codes(raft::device_resources const& handle,
   auto new_vectors_residual =
     make_device_mdarray<float>(handle, mr, make_extents<IdxT>(n_rows, index.rot_dim()));
 
-  flat_compute_residuals(handle,
-                         new_vectors_residual.data_handle(),
-                         n_rows,
-                         index.rotation_matrix(),
-                         index.centers(),
-                         new_vectors,
-                         new_labels,
-                         mr);
+  flat_compute_residuals<T, IdxT>(handle,
+                                  new_vectors_residual.data_handle(),
+                                  n_rows,
+                                  index.rotation_matrix(),
+                                  index.centers(),
+                                  new_vectors,
+                                  new_labels,
+                                  mr);
 
   constexpr uint32_t kBlockSize  = 256;
   const uint32_t threads_per_vec = std::min<uint32_t>(WarpSize, index.pq_book_size());
@@ -819,6 +1175,85 @@ void recompute_internal_state(const raft::device_resources& res, index<IdxT>& in
   }
 }
 
+/**
+ * Helper function: resize the list `label` to fit `new_indices.extent(0)` more records, update
+ * its size in `list_sizes()`, and copy the new indices to the end of the list.
+ *
+ * @return the list size before the extension, i.e. the offset at which to write the new data
+ */
+template <typename IdxT>
+auto extend_list_prepare(raft::device_resources const& res,
+                         index<IdxT>* index,
+                         device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                         uint32_t label) -> uint32_t
+{
+  uint32_t n_rows = new_indices.extent(0);
+  uint32_t offset;
+  // Fetch the current size of the list into a host variable: this is where the new data starts
+  copy(&offset, index->list_sizes().data_handle() + label, 1, res.get_stream());
+  res.sync_stream();  // `offset` is read on the host right below
+  uint32_t new_size = offset + n_rows;
+  copy(index->list_sizes().data_handle() + label, &new_size, 1, res.get_stream());
+  auto spec = list_spec<uint32_t, IdxT>{
+    index->pq_bits(), index->pq_dim(), index->conservative_memory_allocation()};
+  auto& list = index->lists()[label];
+  ivf::resize_list(res, list, spec, new_size, offset);  // allocate the list to fit the new data
+  copy(list->indices.data_handle() + offset, new_indices.data_handle(), n_rows, res.get_stream());
+  return offset;
+}
+
+/**
+ * Extend one list of the index in-place, by the list label, skipping the classification and
+ * encoding steps: `new_codes` are already PQ codes and are only packed into the list.
+ * See the public interface for the API and usage.
+ */
+template <typename IdxT>
+void extend_list_with_codes(raft::device_resources const& res,
+                            index<IdxT>* index,
+                            device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
+                            device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                            uint32_t label)
+{
+  // Allocate memory and write indices
+  auto offset = extend_list_prepare(res, index, new_indices, label);
+  // Pack the data, starting at the old end of the list
+  pack_list_data<IdxT>(res, index, new_codes, label, offset);
+  // Update the pointers and the sizes
+  recompute_internal_state(res, *index);
+}
+
+/**
+ * Extend one list of the index in-place, by the list label, skipping the classification step
+ * (the vectors are still encoded here). See the public interface for the API and usage.
+ */
+template <typename T, typename IdxT>
+void extend_list(raft::device_resources const& res,
+                 index<IdxT>* index,
+                 device_matrix_view<const T, uint32_t, row_major> new_vectors,
+                 device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                 uint32_t label)
+{
+  // Allocate memory and write indices
+  auto offset = extend_list_prepare(res, index, new_indices, label);
+  // Encode the data, starting at the old end of the list
+  encode_list_data<T, IdxT>(res, index, new_vectors, label, offset);
+  // Update the pointers and the sizes
+  recompute_internal_state(res, *index);
+}
+
+/**
+ * Remove all data from a single list: zero its size and release the index's handle to its data.
+ * See the public interface for the API and usage.
+ */
+template <typename IdxT>
+void erase_list(raft::device_resources const& res, index<IdxT>* index, uint32_t label)
+{
+  uint32_t zero = 0;
+  copy(index->list_sizes().data_handle() + label, &zero, 1, res.get_stream());
+  index->lists()[label].reset();  // the list object itself is released, not merely emptied
+  recompute_internal_state(res, *index);
+}
+
 /** Copy the state of an index into a new index, but share the list data among the two. */
 template <typename IdxT>
 auto clone(const raft::device_resources& res, const index<IdxT>& source) -> index<IdxT>
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh
new file mode 100644
index 0000000000..52969dd176
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/neighbors/ivf_list.hpp>
+#include <raft/neighbors/ivf_pq_types.hpp>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/device_atomics.cuh>
+#include <raft/util/integer_utils.hpp>
+#include <raft/util/pow2_utils.cuh>
+#include <raft/util/vectorized.cuh>
+
+#include <variant>
+
+namespace raft::neighbors::ivf_pq::detail {
+
+/** A chunk of a PQ-encoded vector managed by one CUDA thread; read and written as one unit. */
+using pq_vec_t = TxN_t<uint8_t, kIndexGroupVecLen>::io_t;
+
+/**
+ * This type mimics the `uint8_t&` for the indexing operator of `bitfield_view_t`.
+ *
+ * @tparam Bits number of bits comprising the value.
+ */
+template <uint32_t Bits>
+struct bitfield_ref_t {
+  static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte");
+  constexpr static uint8_t kMask = static_cast<uint8_t>((1u << Bits) - 1u);
+  uint8_t* ptr;     // the byte in which the value starts
+  uint32_t offset;  // bit offset of the value within ptr[0]
+  /** Read the value; ptr[1] is only touched when the value crosses the byte boundary. */
+  constexpr operator uint8_t()  // NOLINT
+  {
+    auto pair = static_cast<uint16_t>(ptr[0]);
+    if (offset + Bits > 8) { pair |= static_cast<uint16_t>(ptr[1]) << 8; }
+    return static_cast<uint8_t>((pair >> offset) & kMask);
+  }
+  /** Write the value, keeping the other bits of the touched byte(s); `code` is not masked. */
+  constexpr auto operator=(uint8_t code) -> bitfield_ref_t&
+  {
+    if (offset + Bits > 8) {
+      auto pair = static_cast<uint16_t>(ptr[0]);
+      pair |= static_cast<uint16_t>(ptr[1]) << 8;
+      pair &= ~(static_cast<uint16_t>(kMask) << offset);
+      pair |= static_cast<uint16_t>(code) << offset;
+      ptr[0] = static_cast<uint8_t>(Pow2<256>::mod(pair));
+      ptr[1] = static_cast<uint8_t>(Pow2<256>::div(pair));
+    } else {
+      ptr[0] = (ptr[0] & ~(kMask << offset)) | (code << offset);
+    }
+    return *this;
+  }
+};
+
+/**
+ * View a byte array as an array of unsigned integers of custom small bit size.
+ *
+ * @tparam Bits number of bits comprising a single element of the array.
+ */
+template <uint32_t Bits>
+struct bitfield_view_t {
+  static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte");
+  uint8_t* raw;  // the underlying byte array
+  /** Get a reference-like proxy to the i-th element, which starts at the bit `i * Bits`. */
+  constexpr auto operator[](uint32_t i) -> bitfield_ref_t<Bits>
+  {
+    uint32_t bit_offset = i * Bits;
+    return bitfield_ref_t<Bits>{raw + Pow2<8>::div(bit_offset), Pow2<8>::mod(bit_offset)};
+  }
+};
+
+/**
+ * Process a single vector in a list: unpack its PQ codes and pass them to the action one by one.
+ *
+ * @tparam PqBits the bit length of a single PQ code
+ * @tparam Action tells how to process a single vector (e.g. reconstruct or just unpack)
+ *
+ * @param[in] in_list_data the encoded cluster data.
+ * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread).
+ * @param[in] out_ix the output index passed to the action
+ * @param[in] pq_dim the number of PQ codes per vector
+ * @param action a callable action to be invoked on each PQ code (component of the encoding)
+ *    type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim).
+ */
+template <uint32_t PqBits, typename Action>
+__device__ void run_on_vector(
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
+  uint32_t in_ix,
+  uint32_t out_ix,
+  uint32_t pq_dim,
+  Action action)
+{
+  using group_align         = Pow2<kIndexGroupSize>;
+  const uint32_t group_ix   = group_align::div(in_ix);
+  const uint32_t ingroup_ix = group_align::mod(in_ix);
+
+  pq_vec_t code_chunk;
+  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
+  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
+  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
+    // read the i-th chunk of the vector as a whole
+    code_chunk = *reinterpret_cast<const pq_vec_t*>(&in_list_data(group_ix, i, ingroup_ix, 0));
+    // unpack the codes, one/pq_dim at a time (at most kChunkSize codes per chunk)
+#pragma unroll
+    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
+      // hand the k-th code of the chunk (the j-th code of the vector) over to the action
+      action(code_view[k], out_ix, j);
+    }
+  }
+}
+
+/**
+ * Write a single vector into a list: obtain its PQ codes from the action and pack them.
+ *
+ * @tparam PqBits the bit length of a single PQ code
+ * @tparam SubWarpSize how many threads work on the same ix (only the first thread writes data).
+ * @tparam IdxT type of the index passed to the action
+ * @tparam Action tells how to process a single vector (e.g. encode or just pack)
+ *
+ * @param[out] out_list_data the encoded cluster data.
+ * @param[in] out_ix in-cluster index of the vector to be processed (one-per-SubWarpSize threads).
+ * @param[in] in_ix the input index passed to the action (one-per-SubWarpSize threads).
+ * @param[in] pq_dim the number of PQ codes per vector
+ * @param action a callable invoked by every thread for each PQ code (component of the encoding)
+ *    type: (IdxT in_ix, uint32_t j) -> uint8_t, where j = [0..pq_dim).
+ */
+template <uint32_t PqBits, uint32_t SubWarpSize, typename IdxT, typename Action>
+__device__ void write_vector(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
+  uint32_t out_ix,
+  IdxT in_ix,
+  uint32_t pq_dim,
+  Action action)
+{
+  const uint32_t lane_id = Pow2<SubWarpSize>::mod(threadIdx.x);
+
+  using group_align         = Pow2<kIndexGroupSize>;
+  const uint32_t group_ix   = group_align::div(out_ix);
+  const uint32_t ingroup_ix = group_align::mod(out_ix);
+
+  pq_vec_t code_chunk;
+  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
+  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
+  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
+    // clear the chunk (only the first lane of the sub-warp accumulates the codes)
+    if (lane_id == 0) { code_chunk = pq_vec_t{}; }
+    // write the codes, one/pq_dim at a time (at most kChunkSize codes per chunk)
+#pragma unroll
+    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
+      // all lanes call the action (it may use sub-warp shuffles); only lane 0 stores the code
+      uint8_t code = action(in_ix, j);
+      if (lane_id == 0) { code_view[k] = code; }
+    }
+    // write the chunk to the list
+    if (lane_id == 0) {
+      *reinterpret_cast<pq_vec_t*>(&out_list_data(group_ix, i, ingroup_ix, 0)) = code_chunk;
+    }
+  }
+}
+
+/** Process the given indices or a block of a single list (cluster) in a grid-stride loop. */
+template <uint32_t PqBits, typename Action>
+__device__ void run_on_list(
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t len,
+  uint32_t pq_dim,
+  Action action)
+{
+  for (uint32_t i = threadIdx.x + blockDim.x * blockIdx.x; i < len; i += blockDim.x * gridDim.x) {
+    const uint32_t src_ix = std::holds_alternative<uint32_t>(offset_or_indices)
+                              ? std::get<uint32_t>(offset_or_indices) + i
+                              : std::get<const uint32_t*>(offset_or_indices)[i];
+    run_on_vector<PqBits>(in_list_data, src_ix, i, pq_dim, action);
+  }
+}
+
+/** Write (encode or pack) the given indices or a block of a single list in a grid-stride loop. */
+template <uint32_t PqBits, uint32_t SubWarpSize, typename Action>
+__device__ void write_list(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t len,
+  uint32_t pq_dim,
+  Action action)
+{
+  using subwarp_align = Pow2<SubWarpSize>;
+  uint32_t stride     = subwarp_align::div(blockDim.x * gridDim.x);  // sub-warps in the grid
+  uint32_t ix         = subwarp_align::div(threadIdx.x + blockDim.x * blockIdx.x);
+  for (; ix < len; ix += stride) {  // every vector is handled by exactly one sub-warp
+    const uint32_t dst_ix = std::holds_alternative<uint32_t>(offset_or_indices)
+                              ? std::get<uint32_t>(offset_or_indices) + ix
+                              : std::get<const uint32_t*>(offset_or_indices)[ix];
+    write_vector<PqBits, SubWarpSize>(out_list_data, dst_ix, ix, pq_dim, action);
+  }
+}
+
+}  // namespace raft::neighbors::ivf_pq::detail
diff --git a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
new file mode 100644
index 0000000000..398bd545f1
--- /dev/null
+++ b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/neighbors/detail/ivf_pq_build.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+
+namespace raft::neighbors::ivf_pq::helpers {
+/**
+ * @defgroup ivf_pq_helpers Helper functions for manipulating the IVF-PQ index
+ * @{
+ */
+
+namespace codepacker {
+/**
+ * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index
+ * starting at the given `offset`.
+ *
+ * Bit compression is removed, which means output will have pq_dim dimensional vectors (one code per
+ * byte, instead of ceildiv(pq_dim * pq_bits, 8) bytes of pq codes).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   auto list_data = index.lists()[label]->data.view();
+ *   // allocate the buffer for the output
+ *   uint32_t n_take = 4;
+ *   auto codes = raft::make_device_matrix<uint8_t>(res, n_take, index.pq_dim());
+ *   uint32_t offset = 0;
+ *   // unpack n_take elements from the list
+ *   ivf_pq::helpers::codepacker::unpack(res, list_data, index.pq_bits(), offset, codes.view());
+ * @endcode
+ *
+ * This overload operates on the list data directly and does not need the index object.
+ *
+ * @param[in] res raft resource
+ * @param[in] list_data block to read from
+ * @param[in] pq_bits bit length of encoded vector elements
+ * @param[in] offset
+ *   How many records in the list to skip.
+ * @param[out] codes
+ *   the destination buffer [n_take, index.pq_dim()].
+ *   The length `n_take` defines how many records to unpack,
+ *   `offset + n_take` must not exceed the list size.
+ */
+inline void unpack(
+  raft::device_resources const& res,
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  uint32_t pq_bits,
+  uint32_t offset,
+  device_matrix_view<uint8_t, uint32_t, row_major> codes)
+{
+  ivf_pq::detail::unpack_list_data(codes, list_data, offset, pq_bits, res.get_stream());
+}
+
+/**
+ * @brief Write flat PQ codes into an existing list by the given offset.
+ *
+ * NB: no memory allocation happens here; the list must fit the data (offset + n_vec).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   auto list_data  = index.lists()[label]->data.view();
+ *   // allocate the buffer for the input codes
+ *   auto codes = raft::make_device_matrix<uint8_t>(res, n_vec, index.pq_dim());
+ *   ... fill `codes` with the n_vec vectors to pack into the list ...
+ *   // write codes into the list, skipping the first 42 records
+ *   ivf_pq::helpers::codepacker::pack(
+ *       res, make_const_mdspan(codes.view()), index.pq_bits(), 42, list_data);
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[in] codes flat PQ codes, one code per byte [n_vec, pq_dim]
+ * @param[in] pq_bits bit length of encoded vector elements
+ * @param[in] offset how many records to skip before writing the data into the list
+ * @param[inout] list_data block to write into
+ */
+inline void pack(
+  raft::device_resources const& res,
+  device_matrix_view<const uint8_t, uint32_t, row_major> codes,
+  uint32_t pq_bits,
+  uint32_t offset,
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data)
+{
+  ivf_pq::detail::pack_list_data(list_data, codes, offset, pq_bits, res.get_stream());
+}
+}  // namespace codepacker
+
+/**
+ * @brief Write flat PQ codes into an existing list by the given offset.
+ *
+ * The list is identified by its label.
+ *
+ * NB: no memory allocation happens here; the list must fit the data (offset + n_vec).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will write into the 137th cluster
+ *   uint32_t label = 137;
+ *   // allocate the buffer for the input codes
+ *   auto codes = raft::make_device_matrix<uint8_t>(res, n_vec, index.pq_dim());
+ *   ... fill `codes` with the n_vec vectors to pack into the list ...
+ *   // write codes into the list, skipping the first 42 records
+ *   ivf_pq::helpers::pack_list_data(res, &index, make_const_mdspan(codes.view()), label, 42);
+ * @endcode
+ *
+ * @param[in] res raft resource
+ * @param[inout] index IVF-PQ index.
+ * @param[in] codes flat PQ codes, one code per byte [n_rows, pq_dim]
+ * @param[in] label The id of the list (cluster) into which we write.
+ * @param[in] offset how many records to skip before writing the data into the list
+ */
+template <typename IdxT>
+void pack_list_data(raft::device_resources const& res,
+                    index<IdxT>* index,
+                    device_matrix_view<const uint8_t, uint32_t, row_major> codes,
+                    uint32_t label,
+                    uint32_t offset)
+{
+  ivf_pq::detail::pack_list_data(res, index, codes, label, offset);
+}
+
+/**
+ * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index
+ * starting at the given `offset`, one code per byte (independently of pq_bits).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will unpack the fourth cluster
+ *   uint32_t label = 3;
+ *   // Get the list size
+ *   uint32_t list_size = 0;
+ *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream());
+ *   res.sync_stream();
+ *   // allocate the buffer for the output (one byte per code)
+ *   auto codes = raft::make_device_matrix<uint8_t>(res, list_size, index.pq_dim());
+ *   // unpack the whole list
+ *   ivf_pq::helpers::unpack_list_data(res, index, codes.view(), label, 0);
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res raft resource
+ * @param[in] index IVF-PQ index.
+ * @param[out] out_codes
+ *   the destination buffer [n_take, index.pq_dim()].
+ *   The length `n_take` defines how many records to unpack,
+ *   `offset + n_take` must not exceed the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ * @param[in] offset
+ *   How many records in the list to skip.
+ */
+template <typename IdxT>
+void unpack_list_data(raft::device_resources const& res,
+                      const index<IdxT>& index,
+                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+                      uint32_t label,
+                      uint32_t offset)
+{
+  return ivf_pq::detail::unpack_list_data<IdxT>(res, index, out_codes, label, offset);
+}
+
+/**
+ * @brief Unpack a series of records of a single list (cluster) in the compressed index
+ * by their in-list offsets, one code per byte (independently of pq_bits).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will unpack the fourth cluster
+ *   uint32_t label = 3;
+ *   // Create the selection vector
+ *   auto selected_indices = raft::make_device_vector<uint32_t>(res, 4);
+ *   ... fill the indices ...
+ *   res.sync_stream();
+ *   // allocate the buffer for the output (one byte per code)
+ *   auto codes = raft::make_device_matrix<uint8_t>(res, selected_indices.size(), index.pq_dim());
+ *   // unpack the selected records
+ *   ivf_pq::helpers::unpack_list_data(
+ *       res, index, selected_indices.view(), codes.view(), label);
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res raft resource
+ * @param[in] index IVF-PQ index.
+ * @param[in] in_cluster_indices
+ *   The offsets of the selected indices within the cluster.
+ * @param[out] out_codes
+ *   the destination buffer [n_take, index.pq_dim()].
+ *   The length `n_take` defines how many records to unpack,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ */
+template <typename IdxT>
+void unpack_list_data(raft::device_resources const& res,
+                      const index<IdxT>& index,
+                      device_vector_view<const uint32_t> in_cluster_indices,
+                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+                      uint32_t label)
+{
+  return ivf_pq::detail::unpack_list_data<IdxT>(res, index, out_codes, label, in_cluster_indices);
+}
+
+/**
+ * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index
+ * starting at the given `offset`.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will reconstruct the fourth cluster
+ *   uint32_t label = 3;
+ *   // Get the list size
+ *   uint32_t list_size = 0;
+ *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream());
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto decoded_vectors = raft::make_device_matrix<float>(res, list_size, index.dim());
+ *   // decode the whole list
+ *   ivf_pq::helpers::reconstruct_list_data(res, index, decoded_vectors.view(), label, 0);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res raft resource
+ * @param[in] index IVF-PQ index.
+ * @param[out] out_vectors
+ *   the destination buffer [n_take, index.dim()].
+ *   The length `n_take` defines how many records to reconstruct,
+ *   `offset + n_take` must not exceed the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ * @param[in] offset
+ *   How many records in the list to skip.
+ */
+template <typename T, typename IdxT>
+void reconstruct_list_data(raft::device_resources const& res,
+                           const index<IdxT>& index,
+                           device_matrix_view<T, uint32_t, row_major> out_vectors,
+                           uint32_t label,
+                           uint32_t offset)
+{
+  return ivf_pq::detail::reconstruct_list_data(res, index, out_vectors, label, offset);
+}
+
+/**
+ * @brief Decode a series of records of a single list (cluster) in the compressed index
+ * by their in-list offsets.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will reconstruct records of the fourth cluster
+ *   uint32_t label = 3;
+ *   // Create the selection vector
+ *   auto selected_indices = raft::make_device_vector<uint32_t>(res, 4);
+ *   ... fill the indices ...
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto decoded_vectors = raft::make_device_matrix<float>(
+ *                             res, selected_indices.size(), index.dim());
+ *   // decode the selected records
+ *   ivf_pq::helpers::reconstruct_list_data(
+ *       res, index, selected_indices.view(), decoded_vectors.view(), label);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res raft resource
+ * @param[in] index IVF-PQ index.
+ * @param[in] in_cluster_indices
+ *   The offsets of the selected indices within the cluster.
+ * @param[out] out_vectors
+ *   the destination buffer [n_take, index.dim()].
+ *   The length `n_take` defines how many records to reconstruct,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ */
+template <typename T, typename IdxT>
+void reconstruct_list_data(raft::device_resources const& res,
+                           const index<IdxT>& index,
+                           device_vector_view<const uint32_t> in_cluster_indices,
+                           device_matrix_view<T, uint32_t, row_major> out_vectors,
+                           uint32_t label)
+{
+  return ivf_pq::detail::reconstruct_list_data(res, index, out_vectors, label, in_cluster_indices);
+}
+
+/**
+ * @brief Extend one list of the index in-place, by the list label, skipping the classification and
+ * encoding steps.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will extend the fourth cluster
+ *   uint32_t label = 3;
+ *   // We will fill 4 new vectors
+ *   uint32_t n_vec = 4;
+ *   // Indices of the new vectors
+ *   auto indices = raft::make_device_vector<IdxT>(res, n_vec);
+ *   ... fill the indices ...
+ *   auto new_codes = raft::make_device_matrix<uint8_t, uint32_t, row_major>(
+ *       res, n_vec, index.pq_dim());
+ *   ... fill codes ...
+ *   // extend list with new codes (IdxT is passed explicitly: the views are non-const)
+ *   ivf_pq::helpers::extend_list_with_codes<IdxT>(
+ *       res, &index, new_codes.view(), indices.view(), label);
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res raft resource
+ * @param[inout] index IVF-PQ index.
+ * @param[in] new_codes flat PQ codes, one code per byte [n_rows, index.pq_dim()]
+ * @param[in] new_indices source indices [n_rows]
+ * @param[in] label the id of the target list (cluster).
+ */
+template <typename IdxT>
+void extend_list_with_codes(raft::device_resources const& res,
+                            index<IdxT>* index,
+                            device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
+                            device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                            uint32_t label)
+{
+  ivf_pq::detail::extend_list_with_codes(res, index, new_codes, new_indices, label);
+}
+
+/**
+ * @brief Extend one list of the index in-place, by the list label, skipping the classification
+ * step.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will extend the fourth cluster
+ *   uint32_t label = 3;
+ *   // We will extend with 4 new vectors
+ *   uint32_t n_vec = 4;
+ *   // Indices of the new vectors
+ *   auto indices = raft::make_device_vector<IdxT>(res, n_vec);
+ *   ... fill the indices ...
+ *   auto new_vectors = raft::make_device_matrix<float, uint32_t, row_major>(
+ *       res, n_vec, index.dim());
+ *   ... fill vectors ...
+ *   // extend list with new vectors (types are passed explicitly: the views are non-const)
+ *   ivf_pq::helpers::extend_list<float, IdxT>(
+ *       res, &index, new_vectors.view(), indices.view(), label);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res raft resource
+ * @param[inout] index IVF-PQ index.
+ * @param[in] new_vectors data to encode [n_rows, index.dim()]
+ * @param[in] new_indices source indices [n_rows]
+ * @param[in] label the id of the target list (cluster).
+ *
+ */
+template <typename T, typename IdxT>
+void extend_list(raft::device_resources const& res,
+                 index<IdxT>* index,
+                 device_matrix_view<const T, uint32_t, row_major> new_vectors,
+                 device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                 uint32_t label)
+{
+  ivf_pq::detail::extend_list(res, index, new_vectors, new_indices, label);
+}
+
+/**
+ * @brief Remove all data from a single list (cluster) in the index.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will erase the fourth cluster (label = 3)
+ *   ivf_pq::helpers::erase_list(res, &index, 3);
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ * @param[in] res raft resource
+ * @param[inout] index IVF-PQ index.
+ * @param[in] label the id of the target list (cluster).
+ */
+template <typename IdxT>
+void erase_list(raft::device_resources const& res, index<IdxT>* index, uint32_t label)
+{
+  ivf_pq::detail::erase_list(res, index, label);
+}
+
+/** @} */
+}  // namespace raft::neighbors::ivf_pq::helpers
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 90387cde2f..1cc9760fb5 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -22,8 +22,12 @@
 
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/linalg/map.cuh>
+#include <raft/linalg/map_reduce.cuh>
+#include <raft/matrix/gather.cuh>
 #include <raft/neighbors/ivf_pq.cuh>
 #include <raft/neighbors/ivf_pq_serialize.cuh>
+#include <raft/neighbors/ivf_pq_helpers.cuh>
 #include <raft/random/rng.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -34,8 +38,6 @@
 #include <gtest/gtest.h>
 
 #include <cub/cub.cuh>
-#include <thrust/reduce.h>
-#include <thrust/sequence.h>
 
 #include <algorithm>
 #include <cstddef>
@@ -111,6 +113,33 @@ inline auto operator<<(std::ostream& os, const ivf_pq_inputs& p) -> std::ostream
   return os;
 }
 
+template <typename T>
+void compare_vectors_l2(
+  const raft::device_resources& res, T a, T b, uint32_t label, double compression_ratio, double eps)
+{
+  auto n_rows = a.extent(0);
+  auto dim    = a.extent(1);
+  rmm::mr::managed_memory_resource managed_memory;  // `dist` is read on the host below
+  auto dist = make_device_mdarray<double>(res, &managed_memory, make_extents<uint32_t>(n_rows));
+  linalg::map_offset(res, dist.view(), [a, b, dim] __device__(uint32_t i) {
+    spatial::knn::detail::utils::mapping<float> f{};
+    double d = 0.0f;
+    for (uint32_t j = 0; j < dim; j++) {
+      double t = f(a(i, j)) - f(b(i, j));
+      d += t * t;
+    }
+    return sqrt(d / double(dim));  // root-mean-square difference of row i
+  });
+  res.sync_stream();
+  for (uint32_t i = 0; i < n_rows; i++) {
+    double d = dist(i);
+    // A theoretical estimate of the error is hard to come up with; the bound below was found
+    // by experimentation and grows exponentially with the compression ratio.
+    ASSERT_LE(d, 1.2 * eps * std::pow(2.0, compression_ratio))
+      << " (label = " << label << ", ix = " << i << ", eps = " << eps << ")";
+  }
+}
+
 template <typename IdxT>
 auto min_output_size(const raft::device_resources& handle,
                      const ivf_pq::index<IdxT>& index,
@@ -135,7 +164,6 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
   {
   }
 
- protected:
   void gen_data()
   {
     database.resize(size_t{ps.num_db_vecs} * size_t{ps.dim}, stream_);
@@ -174,7 +202,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     handle_.sync_stream(stream_);
   }
 
-  index<IdxT> build_only()
+  auto build_only()
   {
     auto ipams              = ps.index_params;
     ipams.add_data_on_build = true;
@@ -184,19 +212,17 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     return ivf_pq::build<DataT, IdxT>(handle_, ipams, index_view);
   }
 
-  index<IdxT> build_2_extends()
+  auto build_2_extends()
   {
-    rmm::device_uvector<IdxT> db_indices(ps.num_db_vecs, stream_);
-    thrust::sequence(handle_.get_thrust_policy(),
-                     thrust::device_pointer_cast(db_indices.data()),
-                     thrust::device_pointer_cast(db_indices.data() + ps.num_db_vecs));
+    auto db_indices = make_device_vector<IdxT>(handle_, ps.num_db_vecs);
+    linalg::map_offset(handle_, db_indices.view(), identity_op{});
     handle_.sync_stream(stream_);
     auto size_1 = IdxT(ps.num_db_vecs) / 2;
     auto size_2 = IdxT(ps.num_db_vecs) - size_1;
     auto vecs_1 = database.data();
     auto vecs_2 = database.data() + size_t(size_1) * size_t(ps.dim);
-    auto inds_1 = db_indices.data();
-    auto inds_2 = db_indices.data() + size_t(size_1);
+    auto inds_1 = db_indices.data_handle();
+    auto inds_2 = db_indices.data_handle() + size_t(size_1);
 
     auto ipams              = ps.index_params;
     ipams.add_data_on_build = false;
@@ -216,17 +242,160 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     return idx;
   }
 
-  index<IdxT> build_serialize()
+  auto build_serialize()
   {
     ivf_pq::serialize<IdxT>(handle_, "ivf_pq_index", build_only());
     return ivf_pq::deserialize<IdxT>(handle_, "ivf_pq_index");
   }
 
+  void check_reconstruction(const index<IdxT>& index,
+                            double compression_ratio,
+                            uint32_t label,
+                            uint32_t n_take,
+                            uint32_t n_skip)
+  {
+    auto& rec_list = index.lists()[label];
+    auto dim       = index.dim();
+    n_take         = std::min<uint32_t>(n_take, rec_list->size.load());  // clamp to list size
+    n_skip         = std::min<uint32_t>(n_skip, rec_list->size.load() - n_take);  // stay in bounds
+
+    if (n_take == 0) { return; }  // empty list: nothing to compare
+
+    auto rec_data  = make_device_matrix<DataT>(handle_, n_take, dim);
+    auto orig_data = make_device_matrix<DataT>(handle_, n_take, dim);
+
+    ivf_pq::helpers::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip);
+
+    matrix::gather(database.data(),
+                   IdxT{dim},
+                   IdxT{n_take},
+                   rec_list->indices.data_handle() + n_skip,  // ids of the decoded records
+                   IdxT{n_take},
+                   orig_data.data_handle(),
+                   stream_);
+
+    compare_vectors_l2(handle_, rec_data.view(), orig_data.view(), label, compression_ratio, 0.06);
+  }
+
+  void check_reconstruct_extend(index<IdxT>* index, double compression_ratio, uint32_t label)
+  {
+    // NB: this is a copy, not a reference, so the old list is retained; the index will have to
+    // create a new list on the `erase_list` op.
+    auto old_list = index->lists()[label];
+    auto n_rows   = old_list->size.load();
+    if (n_rows == 0) { return; }
+
+    auto vectors_1 = make_device_matrix<EvalT>(handle_, n_rows, index->dim());
+    auto indices   = make_device_vector<IdxT>(handle_, n_rows);
+    copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_);
+
+    ivf_pq::helpers::reconstruct_list_data(handle_, *index, vectors_1.view(), label, 0);
+    ivf_pq::helpers::erase_list(handle_, index, label);
+    // NB: passing the type parameters because the implicit non-const->const conversion of the
+    // mdspans breaks type inference
+    ivf_pq::helpers::extend_list<EvalT, IdxT>(
+      handle_, index, vectors_1.view(), indices.view(), label);
+
+    auto& new_list = index->lists()[label];
+    ASSERT_NE(old_list.get(), new_list.get())
+      << "The old list should have been shared and retained after ivf_pq index has erased the "
+         "corresponding cluster.";
+
+    auto vectors_2 = make_device_matrix<EvalT>(handle_, n_rows, index->dim());
+    ivf_pq::helpers::reconstruct_list_data(handle_, *index, vectors_2.view(), label, 0);
+    // The code search is unstable, and there's a high chance of repeating values of the lvl-2
+    // codes. Hence, the encoding-decoding chain often alters both the PQ codes and the
+    // reconstructed data.
+    compare_vectors_l2(
+      handle_, vectors_1.view(), vectors_2.view(), label, compression_ratio, 0.025);
+  }
+
+  void check_packing(index<IdxT>* index, uint32_t label)
+  {
+    auto old_list = index->lists()[label];  // a copy: keeps the old list alive past erase_list
+    auto n_rows   = old_list->size.load();
+
+    if (n_rows == 0) { return; }
+
+    auto codes   = make_device_matrix<uint8_t>(handle_, n_rows, index->pq_dim());
+    auto indices = make_device_vector<IdxT>(handle_, n_rows);
+    copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_);
+    // Round trip: dump the codes, drop the list, and re-create it from the dumped codes.
+    ivf_pq::helpers::unpack_list_data(handle_, *index, codes.view(), label, 0);
+    ivf_pq::helpers::erase_list(handle_, index, label);
+    ivf_pq::helpers::extend_list_with_codes<IdxT>(
+      handle_, index, codes.view(), indices.view(), label);
+
+    auto& new_list = index->lists()[label];
+    ASSERT_NE(old_list.get(), new_list.get())
+      << "The old list should have been shared and retained after ivf_pq index has erased the "
+         "corresponding cluster.";
+    auto list_data_size = (n_rows / ivf_pq::kIndexGroupSize) * new_list->data.extent(1) *
+                          new_list->data.extent(2) * new_list->data.extent(3);
+    // Only whole groups of kIndexGroupSize records are compared (note the integer division).
+    ASSERT_TRUE(old_list->data.size() >= list_data_size);
+    ASSERT_TRUE(new_list->data.size() >= list_data_size);
+    ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+                            new_list->data.data_handle(),
+                            list_data_size,
+                            Compare<uint8_t>{}));
+
+    // Re-pack a few of the same vectors in place; the list data must stay unchanged.
+    int row_offset = 9;
+    int n_vec      = 3;
+    ASSERT_TRUE(row_offset + n_vec < n_rows);
+    size_t offset      = row_offset * index->pq_dim();
+    auto codes_to_pack = make_device_matrix_view<const uint8_t, uint32_t>(
+      codes.data_handle() + offset, n_vec, index->pq_dim());
+    ivf_pq::helpers::pack_list_data(handle_, index, codes_to_pack, label, row_offset);
+    ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+                            new_list->data.data_handle(),
+                            list_data_size,
+                            Compare<uint8_t>{}));
+
+    // Another test with the API that takes list_data directly
+    auto list_data  = index->lists()[label]->data.view();
+    uint32_t n_take = 4;
+    ASSERT_TRUE(row_offset + n_take < n_rows);
+    auto codes2 = raft::make_device_matrix<uint8_t>(handle_, n_take, index->pq_dim());
+    ivf_pq::helpers::codepacker::unpack(
+      handle_, list_data, index->pq_bits(), row_offset, codes2.view());
+
+    // Write it back
+    ivf_pq::helpers::codepacker::pack(
+      handle_, make_const_mdspan(codes2.view()), index->pq_bits(), row_offset, list_data);
+    ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+                            new_list->data.data_handle(),
+                            list_data_size,
+                            Compare<uint8_t>{}));
+  }
+
   template <typename BuildIndex>
   void run(BuildIndex build_index)
   {
     index<IdxT> index = build_index();
 
+    double compression_ratio =
+      static_cast<double>(ps.dim * 8) / static_cast<double>(index.pq_dim() * index.pq_bits());
+
+    for (uint32_t label = 0; label < index.n_lists(); label++) {
+      switch (label % 3) {
+        case 0: {
+          // Reconstruct and re-write vectors for one label
+          check_reconstruct_extend(&index, compression_ratio, label);
+        } break;
+        case 1: {
+          // Dump and re-write codes for one label
+          check_packing(&index, label);
+        } break;
+        default: {
+          // check a small subset of data in a randomly chosen cluster to see if the data
+          // reconstruction works well.
+          check_reconstruction(index, compression_ratio, label, 100, 7);
+        }
+      }
+    }
+
     size_t queries_size = ps.num_queries * ps.k;
     std::vector<IdxT> indices_ivf_pq(queries_size);
     std::vector<EvalT> distances_ivf_pq(queries_size);
@@ -251,11 +420,9 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     // A very conservative lower bound on recall
     double min_recall =
       static_cast<double>(ps.search_params.n_probes) / static_cast<double>(ps.index_params.n_lists);
-    double low_precision_factor =
-      static_cast<double>(ps.dim * 8) / static_cast<double>(index.pq_dim() * index.pq_bits());
     // Using a heuristic to lower the required recall due to code-packing errors
     min_recall =
-      std::min(std::erfc(0.05 * low_precision_factor / std::max(min_recall, 0.5)), min_recall);
+      std::min(std::erfc(0.05 * compression_ratio / std::max(min_recall, 0.5)), min_recall);
     // Use explicit per-test min recall value if provided.
     min_recall = ps.min_recall.value_or(min_recall);
 
@@ -265,7 +432,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
                                 distances_ivf_pq,
                                 ps.num_queries,
                                 ps.k,
-                                0.0001 * low_precision_factor,
+                                0.0001 * compression_ratio,
                                 min_recall))
       << ps;
 

From 5e84daa6ad0233e80996505086dde9e616c1321b Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Mon, 17 Apr 2023 14:19:51 -0700
Subject: [PATCH 76/89] Add python bindings for matrix::select_k (#1422)

Authors:
  - Ben Frederickson (https://github.com/benfred)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/1422
---
 cpp/CMakeLists.txt                            |   7 +-
 cpp/include/raft/matrix/select_k.cuh          |  20 +--
 cpp/include/raft_runtime/matrix/select_k.hpp  |  32 +++++
 .../raft_internal/matrix/select_k.cuh         |  13 +-
 .../matrix/select_k_float_int64_t.cu          |  37 +++++
 python/pylibraft/CMakeLists.txt               |   1 +
 .../pylibraft/pylibraft/matrix/CMakeLists.txt |  24 ++++
 .../pylibraft/pylibraft/matrix/__init__.pxd   |  14 ++
 python/pylibraft/pylibraft/matrix/__init__.py |  18 +++
 .../pylibraft/matrix/cpp/__init__.pxd         |   0
 .../pylibraft/matrix/cpp/__init__.py          |  14 ++
 .../pylibraft/matrix/cpp/select_k.pxd         |  39 +++++
 .../pylibraft/pylibraft/matrix/select_k.pyx   | 133 ++++++++++++++++++
 .../pylibraft/neighbors/brute_force.pyx       |   3 +-
 ...test_brue_force.py => test_brute_force.py} |   0
 .../pylibraft/pylibraft/test/test_doctests.py |   2 +
 .../pylibraft/pylibraft/test/test_select_k.py |  54 +++++++
 17 files changed, 389 insertions(+), 22 deletions(-)
 create mode 100644 cpp/include/raft_runtime/matrix/select_k.hpp
 create mode 100644 cpp/src/raft_runtime/matrix/select_k_float_int64_t.cu
 create mode 100644 python/pylibraft/pylibraft/matrix/CMakeLists.txt
 create mode 100644 python/pylibraft/pylibraft/matrix/__init__.pxd
 create mode 100644 python/pylibraft/pylibraft/matrix/__init__.py
 create mode 100644 python/pylibraft/pylibraft/matrix/cpp/__init__.pxd
 create mode 100644 python/pylibraft/pylibraft/matrix/cpp/__init__.py
 create mode 100644 python/pylibraft/pylibraft/matrix/cpp/select_k.pxd
 create mode 100644 python/pylibraft/pylibraft/matrix/select_k.pyx
 rename python/pylibraft/pylibraft/test/{test_brue_force.py => test_brute_force.py} (100%)
 create mode 100644 python/pylibraft/pylibraft/test/test_select_k.py

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 1c705cc786..7f64b92306 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -70,13 +70,11 @@ option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations"
        ${RAFT_COMPILE_LIBRARY_DEFAULT}
 )
 
-
-# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs
-# to have different values for the `Threads::Threads` target. Setting this flag ensures
+# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to
+# have different values for the `Threads::Threads` target. Setting this flag ensures
 # `Threads::Threads` is the same value across all builds so that cache hits occur
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 
-
 include(CMakeDependentOption)
 # cmake_dependent_option( RAFT_USE_FAISS_STATIC "Build and statically link the FAISS library for
 # nearest neighbors search on GPU" ON RAFT_COMPILE_LIBRARY OFF )
@@ -357,6 +355,7 @@ if(RAFT_COMPILE_LIBRARY)
     src/raft_runtime/cluster/update_centroids_float.cu
     src/raft_runtime/distance/fused_l2_min_arg.cu
     src/raft_runtime/distance/pairwise_distance.cu
+    src/raft_runtime/matrix/select_k_float_int64_t.cu
     src/raft_runtime/neighbors/brute_force_knn_int64_t_float.cu
     src/raft_runtime/neighbors/ivf_flat_build.cu
     src/raft_runtime/neighbors/ivf_flat_search.cu
diff --git a/cpp/include/raft/matrix/select_k.cuh b/cpp/include/raft/matrix/select_k.cuh
index 9a1a14fd73..7951cbdb03 100644
--- a/cpp/include/raft/matrix/select_k.cuh
+++ b/cpp/include/raft/matrix/select_k.cuh
@@ -42,13 +42,13 @@ namespace raft::matrix {
  * @code{.cpp}
  *   using namespace raft;
  *   // get a 2D row-major array of values to search through
- *   auto in_values = {... input device_matrix_view<const float, size_t, row_major> ...}
+ *   auto in_values = {... input device_matrix_view<const float, int64_t, row_major> ...}
  *   // prepare output arrays
- *   auto out_extents = make_extents<size_t>(in_values.extent(0), k);
+ *   auto out_extents = make_extents<int64_t>(in_values.extent(0), k);
  *   auto out_values  = make_device_mdarray<float>(handle, out_extents);
- *   auto out_indices = make_device_mdarray<size_t>(handle, out_extents);
+ *   auto out_indices = make_device_mdarray<int64_t>(handle, out_extents);
  *   // search `k` smallest values in each row
- *   matrix::select_k<float, size_t>(
+ *   matrix::select_k<float, int64_t>(
  *     handle, in_values, std::nullopt, out_values.view(), out_indices.view(), true);
  * @endcode
  *
@@ -76,13 +76,13 @@ namespace raft::matrix {
  */
 template <typename T, typename IdxT>
 void select_k(const device_resources& handle,
-              raft::device_matrix_view<const T, size_t, row_major> in_val,
-              std::optional<raft::device_matrix_view<const IdxT, size_t, row_major>> in_idx,
-              raft::device_matrix_view<T, size_t, row_major> out_val,
-              raft::device_matrix_view<IdxT, size_t, row_major> out_idx,
+              raft::device_matrix_view<const T, int64_t, row_major> in_val,
+              std::optional<raft::device_matrix_view<const IdxT, int64_t, row_major>> in_idx,
+              raft::device_matrix_view<T, int64_t, row_major> out_val,
+              raft::device_matrix_view<IdxT, int64_t, row_major> out_idx,
               bool select_min)
 {
-  RAFT_EXPECTS(out_val.extent(1) <= size_t(std::numeric_limits<int>::max()),
+  RAFT_EXPECTS(out_val.extent(1) <= int64_t(std::numeric_limits<int>::max()),
                "output k must fit the int type.");
   auto batch_size = in_val.extent(0);
   auto len        = in_val.extent(1);
@@ -93,7 +93,7 @@ void select_k(const device_resources& handle,
     RAFT_EXPECTS(batch_size == in_idx->extent(0), "batch sizes must be equal");
     RAFT_EXPECTS(len == in_idx->extent(1), "value and index input lengths must be equal");
   }
-  RAFT_EXPECTS(size_t(k) == out_idx.extent(1), "value and index output lengths must be equal");
+  RAFT_EXPECTS(int64_t(k) == out_idx.extent(1), "value and index output lengths must be equal");
   return detail::select_k<T, IdxT>(in_val.data_handle(),
                                    in_idx.has_value() ? in_idx->data_handle() : nullptr,
                                    batch_size,
diff --git a/cpp/include/raft_runtime/matrix/select_k.hpp b/cpp/include/raft_runtime/matrix/select_k.hpp
new file mode 100644
index 0000000000..08c0e01d0a
--- /dev/null
+++ b/cpp/include/raft_runtime/matrix/select_k.hpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+
+#include <optional>
+
+namespace raft::runtime::matrix {
+void select_k(const device_resources& handle,
+              raft::device_matrix_view<const float, int64_t, row_major> in_val,
+              std::optional<raft::device_matrix_view<const int64_t, int64_t, row_major>> in_idx,
+              raft::device_matrix_view<float, int64_t, row_major> out_val,
+              raft::device_matrix_view<int64_t, int64_t, row_major> out_idx,
+              bool select_min);
+
+}  // namespace raft::runtime::matrix
diff --git a/cpp/internal/raft_internal/matrix/select_k.cuh b/cpp/internal/raft_internal/matrix/select_k.cuh
index 8dedec67cb..3d7a11e91e 100644
--- a/cpp/internal/raft_internal/matrix/select_k.cuh
+++ b/cpp/internal/raft_internal/matrix/select_k.cuh
@@ -86,12 +86,13 @@ void select_k_impl(const device_resources& handle,
   auto stream = handle.get_stream();
   switch (algo) {
     case Algo::kPublicApi: {
-      auto in_extent   = make_extents<size_t>(batch_size, len);
-      auto out_extent  = make_extents<size_t>(batch_size, k);
-      auto in_span     = make_mdspan<const T, size_t, row_major, false, true>(in, in_extent);
-      auto in_idx_span = make_mdspan<const IdxT, size_t, row_major, false, true>(in_idx, in_extent);
-      auto out_span    = make_mdspan<T, size_t, row_major, false, true>(out, out_extent);
-      auto out_idx_span = make_mdspan<IdxT, size_t, row_major, false, true>(out_idx, out_extent);
+      auto in_extent  = make_extents<int64_t>(batch_size, len);
+      auto out_extent = make_extents<int64_t>(batch_size, k);
+      auto in_span    = make_mdspan<const T, int64_t, row_major, false, true>(in, in_extent);
+      auto in_idx_span =
+        make_mdspan<const IdxT, int64_t, row_major, false, true>(in_idx, in_extent);
+      auto out_span     = make_mdspan<T, int64_t, row_major, false, true>(out, out_extent);
+      auto out_idx_span = make_mdspan<IdxT, int64_t, row_major, false, true>(out_idx, out_extent);
       if (in_idx == nullptr) {
         // NB: std::nullopt prevents automatic inference of the template parameters.
         return matrix::select_k<T, IdxT>(
diff --git a/cpp/src/raft_runtime/matrix/select_k_float_int64_t.cu b/cpp/src/raft_runtime/matrix/select_k_float_int64_t.cu
new file mode 100644
index 0000000000..309ac50c6b
--- /dev/null
+++ b/cpp/src/raft_runtime/matrix/select_k_float_int64_t.cu
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/matrix/select_k.cuh>
+#include <raft/matrix/specializations.cuh>
+
+#include <raft_runtime/matrix/select_k.hpp>
+
+#include <vector>
+
+namespace raft::runtime::matrix {
+
+void select_k(const device_resources& handle,
+              raft::device_matrix_view<const float, int64_t, row_major> in_val,
+              std::optional<raft::device_matrix_view<const int64_t, int64_t, row_major>> in_idx,
+              raft::device_matrix_view<float, int64_t, row_major> out_val,
+              raft::device_matrix_view<int64_t, int64_t, row_major> out_idx,
+              bool select_min)
+{
+  raft::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, select_min);
+}
+}  // namespace raft::runtime::matrix
diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt
index 349a2b08ba..069bd98222 100644
--- a/python/pylibraft/CMakeLists.txt
+++ b/python/pylibraft/CMakeLists.txt
@@ -86,6 +86,7 @@ rapids_cython_init()
 
 add_subdirectory(pylibraft/common)
 add_subdirectory(pylibraft/distance)
+add_subdirectory(pylibraft/matrix)
 add_subdirectory(pylibraft/neighbors)
 add_subdirectory(pylibraft/random)
 add_subdirectory(pylibraft/cluster)
diff --git a/python/pylibraft/pylibraft/matrix/CMakeLists.txt b/python/pylibraft/pylibraft/matrix/CMakeLists.txt
new file mode 100644
index 0000000000..ffba10dea9
--- /dev/null
+++ b/python/pylibraft/pylibraft/matrix/CMakeLists.txt
@@ -0,0 +1,24 @@
+# =============================================================================
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# Set the list of Cython files to build
+set(cython_sources select_k.pyx)
+set(linked_libraries raft::raft raft::compiled)
+
+# Build all of the Cython targets
+rapids_cython_create_modules(
+  CXX
+  SOURCE_FILES "${cython_sources}"
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX matrix_
+)
diff --git a/python/pylibraft/pylibraft/matrix/__init__.pxd b/python/pylibraft/pylibraft/matrix/__init__.pxd
new file mode 100644
index 0000000000..a7e7b75096
--- /dev/null
+++ b/python/pylibraft/pylibraft/matrix/__init__.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/python/pylibraft/pylibraft/matrix/__init__.py b/python/pylibraft/pylibraft/matrix/__init__.py
new file mode 100644
index 0000000000..5eb35795ed
--- /dev/null
+++ b/python/pylibraft/pylibraft/matrix/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from .select_k import select_k
+
+__all__ = ["select_k"]
diff --git a/python/pylibraft/pylibraft/matrix/cpp/__init__.pxd b/python/pylibraft/pylibraft/matrix/cpp/__init__.pxd
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/python/pylibraft/pylibraft/matrix/cpp/__init__.py b/python/pylibraft/pylibraft/matrix/cpp/__init__.py
new file mode 100644
index 0000000000..8f2cc34855
--- /dev/null
+++ b/python/pylibraft/pylibraft/matrix/cpp/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
diff --git a/python/pylibraft/pylibraft/matrix/cpp/select_k.pxd b/python/pylibraft/pylibraft/matrix/cpp/select_k.pxd
new file mode 100644
index 0000000000..ab466fdce6
--- /dev/null
+++ b/python/pylibraft/pylibraft/matrix/cpp/select_k.pxd
@@ -0,0 +1,39 @@
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from libc.stdint cimport int64_t
+from libcpp cimport bool
+
+from pylibraft.common.cpp.mdspan cimport device_matrix_view, row_major
+from pylibraft.common.cpp.optional cimport optional
+from pylibraft.common.handle cimport device_resources
+
+
+cdef extern from "raft_runtime/matrix/select_k.hpp" \
+        namespace "raft::runtime::matrix" nogil:
+
+    cdef void select_k(const device_resources & handle,
+                       device_matrix_view[float, int64_t, row_major],
+                       optional[device_matrix_view[int64_t,
+                                                   int64_t,
+                                                   row_major]],
+                       device_matrix_view[float, int64_t, row_major],
+                       device_matrix_view[int64_t, int64_t, row_major],
+                       bool) except +
diff --git a/python/pylibraft/pylibraft/matrix/select_k.pyx b/python/pylibraft/pylibraft/matrix/select_k.pyx
new file mode 100644
index 0000000000..fbb1e2e5d3
--- /dev/null
+++ b/python/pylibraft/pylibraft/matrix/select_k.pyx
@@ -0,0 +1,133 @@
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from cython.operator cimport dereference as deref
+from libc.stdint cimport int64_t
+from libcpp cimport bool
+
+import numpy as np
+
+from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray
+from pylibraft.common.handle import auto_sync_handle
+from pylibraft.common.input_validation import is_c_contiguous
+
+from pylibraft.common.cpp.mdspan cimport (
+    device_matrix_view,
+    host_matrix_view,
+    make_device_matrix_view,
+    make_host_matrix_view,
+    row_major,
+)
+from pylibraft.common.cpp.optional cimport optional
+from pylibraft.common.handle cimport device_resources
+from pylibraft.common.mdspan cimport get_dmv_float, get_dmv_int64
+from pylibraft.matrix.cpp.select_k cimport select_k as c_select_k
+
+
+@auto_sync_handle
+@auto_convert_output
+def select_k(dataset, k=None, distances=None, indices=None, select_min=True,
+             handle=None):
+    """
+    Selects the top k items from each row in a matrix
+
+
+    Parameters
+    ----------
+    dataset : array interface compliant matrix, row-major layout,
+        shape (n_rows, dim). Supported dtype [float]
+    k : int
+        Number of items to return for each row.  Optional if indices or
+        distances arrays are given (in which case their second dimension
+        is k).
+    distances :  Optional array interface compliant matrix shape
+                (n_rows, k), dtype float. If supplied,
+                distances will be written here in-place. (default None)
+    indices :  Optional array interface compliant matrix shape
+                (n_rows, k), dtype int64_t. If supplied, neighbor
+                indices will be written here in-place. (default None)
+    select_min : bool
+        Whether to select the k smallest (True) or k largest (False) items
+
+    {handle_docstring}
+
+    Returns
+    -------
+    distances: array interface compliant object containing resulting distances
+               shape (n_rows, k)
+
+    indices: array interface compliant object containing resulting indices
+             shape (n_rows, k)
+
+    Examples
+    --------
+
+    >>> import cupy as cp
+
+    >>> from pylibraft.matrix import select_k
+
+    >>> n_features = 50
+    >>> n_rows = 1000
+
+    >>> queries = cp.random.random_sample((n_rows, n_features),
+    ...                                   dtype=cp.float32)
+    >>> k = 40
+    >>> distances, ids = select_k(queries, k)
+    >>> distances = cp.asarray(distances)
+    >>> ids = cp.asarray(ids)
+    """
+
+    dataset_cai = cai_wrapper(dataset)
+
+    if k is None:
+        if indices is not None:
+            k = cai_wrapper(indices).shape[1]
+        elif distances is not None:
+            k = cai_wrapper(distances).shape[1]
+        else:
+            raise ValueError("Argument k must be specified if both indices "
+                             "and distances arg is None")
+
+    n_rows = dataset_cai.shape[0]
+    if indices is None:
+        indices = device_ndarray.empty((n_rows, k), dtype='int64')
+
+    if distances is None:
+        distances = device_ndarray.empty((n_rows, k), dtype='float32')
+
+    distances_cai = cai_wrapper(distances)
+    indices_cai = cai_wrapper(indices)
+
+    cdef device_resources* handle_ = \
+        <device_resources*><size_t>handle.getHandle()
+    # in_idx stays default-constructed: no input indices are supplied.
+    cdef optional[device_matrix_view[int64_t, int64_t, row_major]] in_idx
+
+    if dataset_cai.dtype == np.float32:
+        c_select_k(deref(handle_),
+                   get_dmv_float(dataset_cai, check_shape=True),
+                   in_idx,
+                   get_dmv_float(distances_cai, check_shape=True),
+                   get_dmv_int64(indices_cai, check_shape=True),
+                   <bool>select_min)
+    else:
+        raise TypeError("dtype %s not supported" % dataset_cai.dtype)
+
+    return distances, indices
diff --git a/python/pylibraft/pylibraft/neighbors/brute_force.pyx b/python/pylibraft/pylibraft/neighbors/brute_force.pyx
index dbd888756d..8836307a5a 100644
--- a/python/pylibraft/pylibraft/neighbors/brute_force.pyx
+++ b/python/pylibraft/pylibraft/neighbors/brute_force.pyx
@@ -40,7 +40,6 @@ from pylibraft.common.handle cimport device_resources
 from pylibraft.common.mdspan cimport get_dmv_float, get_dmv_int64
 
 from pylibraft.common.handle import auto_sync_handle
-from pylibraft.common.input_validation import is_c_contiguous
 from pylibraft.common.interruptible import cuda_interruptible
 
 from pylibraft.distance.distance_type cimport DistanceType
@@ -144,7 +143,7 @@ def knn(dataset, queries, k=None, indices=None, distances=None,
             raise ValueError("Argument k must be specified if both indices "
                              "and distances arg is None")
 
-    n_queries = cai_wrapper(queries).shape[0]
+    n_queries = queries_cai.shape[0]
 
     if indices is None:
         indices = device_ndarray.empty((n_queries, k), dtype='int64')
diff --git a/python/pylibraft/pylibraft/test/test_brue_force.py b/python/pylibraft/pylibraft/test/test_brute_force.py
similarity index 100%
rename from python/pylibraft/pylibraft/test/test_brue_force.py
rename to python/pylibraft/pylibraft/test/test_brute_force.py
diff --git a/python/pylibraft/pylibraft/test/test_doctests.py b/python/pylibraft/pylibraft/test/test_doctests.py
index 34be6c55f5..19e5c5c22f 100644
--- a/python/pylibraft/pylibraft/test/test_doctests.py
+++ b/python/pylibraft/pylibraft/test/test_doctests.py
@@ -22,6 +22,7 @@
 
 import pylibraft.cluster
 import pylibraft.distance
+import pylibraft.matrix
 import pylibraft.neighbors
 import pylibraft.random
 
@@ -94,6 +95,7 @@ def _find_doctests_in_obj(obj, finder=None, criteria=None):
 DOC_STRINGS = list(_find_doctests_in_obj(pylibraft.cluster))
 DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.common))
 DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.distance))
+DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.matrix.select_k))
 DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors))
 DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors.ivf_pq))
 DOC_STRINGS.extend(_find_doctests_in_obj(pylibraft.neighbors.brute_force))
diff --git a/python/pylibraft/pylibraft/test/test_select_k.py b/python/pylibraft/pylibraft/test/test_select_k.py
new file mode 100644
index 0000000000..203e735b9c
--- /dev/null
+++ b/python/pylibraft/pylibraft/test/test_select_k.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import pytest
+
+from pylibraft.common import device_ndarray
+from pylibraft.matrix import select_k
+
+
+@pytest.mark.parametrize("n_rows", [32, 100])
+@pytest.mark.parametrize("n_cols", [40, 100])
+@pytest.mark.parametrize("k", [1, 5, 16, 35])
+@pytest.mark.parametrize("inplace", [True, False])
+def test_select_k(n_rows, n_cols, k, inplace):
+    """Compare select_k with a NumPy sort, with and without output buffers."""
+    dataset = np.random.random_sample((n_rows, n_cols)).astype("float32")
+    dataset_device = device_ndarray(dataset)
+
+    # Only the in-place variant passes preallocated output buffers.
+    out = {}
+    if inplace:
+        shape = (n_rows, k)
+        out["indices"] = device_ndarray(np.zeros(shape, dtype="int64"))
+        out["distances"] = device_ndarray(np.zeros(shape, dtype="float32"))
+
+    ret_distances, ret_indices = select_k(dataset_device, k=k, **out)
+
+    # In-place results must be readable through the caller's own buffers.
+    dists_device = out["distances"] if inplace else ret_distances
+    idx_device = out["indices"] if inplace else ret_indices
+    got_dists = dists_device.copy_to_host()
+    got_idx = idx_device.copy_to_host()
+
+    # select_min defaults to True: expect the k smallest, in ascending order.
+    expected = np.sort(dataset, axis=1)[:, :k]
+    np.testing.assert_allclose(expected, got_dists, atol=1e-4, rtol=1e-4)
+
+    # Gather by the returned indices instead of comparing them to argsort,
+    # so that tied values cannot cause spurious failures.
+    gathered = np.take_along_axis(dataset, got_idx, axis=1)
+    np.testing.assert_allclose(gathered, got_dists, atol=1e-4, rtol=1e-4)

From 9906aba8697b2feec4fc62fb86be549d2de5392e Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 18 Apr 2023 14:39:27 +0200
Subject: [PATCH 77/89] Fixup yaml and toml files

---
 .github/workflows/pr.yaml                        |  4 ++--
 .github/workflows/test.yaml                      |  4 ++--
 conda/environments/all_cuda-118_arch-x86_64.yaml |  4 ++--
 dependencies.yaml                                |  6 +++---
 python/pylibraft/pyproject.toml                  | 11 +++++++----
 python/raft-dask/pyproject.toml                  |  6 ++----
 6 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 82435a103f..fc8c8d516e 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -103,7 +103,7 @@ jobs:
       build_type: pull-request
       package-name: raft_dask
       # Always want to test against latest dask/distributed.
-      test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
-      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
+      test-before-amd64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
+      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibraft_cu11 rapids-download-wheels-from-s3 ./local-pylibraft-dep && pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl && pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
       test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"
       test-smoketest: "python ./ci/wheel_smoke_test_raft_dask.py"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 11ff3333d1..dc8f7b6f2b 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -51,6 +51,6 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       package-name: raft_dask
-      test-before-amd64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
-      test-before-arm64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
+      test-before-amd64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
+      test-before-arm64: "pip install git+https://github.com/dask/dask.git@2023.3.2 git+https://github.com/dask/distributed.git@2023.3.2.1 git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
       test-unittest: "python -m pytest -v ./python/raft-dask/raft_dask/test"
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 5d5dc0e378..0e06076f1a 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -19,7 +19,7 @@ dependencies:
 - cxx-compiler
 - cython>=0.29,<0.30
 - dask-core==2023.3.2
-- dask-cuda==23.4.*
+- dask-cuda==23.6.*
 - dask==2023.3.2
 - distributed==2023.3.2.1
 - doxygen>=1.8.20
@@ -44,7 +44,7 @@ dependencies:
 - pytest
 - pytest-cov
 - recommonmark
-- rmm=23.06
+- rmm==23.6.*
 - scikit-build>=0.13.1
 - scikit-learn
 - scipy
diff --git a/dependencies.yaml b/dependencies.yaml
index 3cbab2fa8d..f3e0cd1167 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -133,7 +133,7 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - &cuda_python cuda-python >=11.7.1,<12.0
-          - &rmm rmm==23.4.*
+          - &rmm rmm==23.6.*
   checks:
     common:
       - output_types: [conda, requirements]
@@ -266,12 +266,12 @@ dependencies:
       - output_types: [conda, pyproject]
         packages:
           - dask==2023.3.2
-          - dask-cuda==23.4.*
+          - dask-cuda==23.6.*
           - distributed==2023.3.2.1
           - joblib>=0.11
           - numba>=0.49
           - *numpy
-          - ucx-py==0.31.*
+          - ucx-py==0.32.*
       - output_types: conda
         packages:
           - dask-core==2023.3.2
diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index b4eb296089..4fe0a52ce6 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -20,7 +20,10 @@ requires = [
     "cython>=0.29,<0.30",
     "ninja",
     "rmm==23.6.*",
-]
+    "scikit-build>=0.13.1",
+    "setuptools",
+    "wheel",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -34,10 +37,10 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.8"
 dependencies = [
-    "numpy",
-    "cuda-python>=11.7.1,<12.0",
+    "cuda-python >=11.7.1,<12.0",
+    "numpy>=1.21",
     "rmm==23.6.*",
-]
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
     "Programming Language :: Python",
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 4901df6c38..d7095aa00c 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -34,7 +34,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.8"
 dependencies = [
-    "dask-cuda==23.4.*",
+    "dask-cuda==23.6.*",
     "dask==2023.3.2",
     "distributed==2023.3.2.1",
     "joblib>=0.11",
@@ -42,9 +42,7 @@ dependencies = [
     "numpy>=1.21",
     "pylibraft==23.6.*",
     "ucx-py==0.32.*",
-    "distributed>=2023.1.1",
-    "pylibraft==23.6.*",
-]
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
     "Programming Language :: Python",

From 1299b231c758d82d86f5d29a87685cb57b3880c4 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 18 Apr 2023 14:39:56 +0200
Subject: [PATCH 78/89] Fixup ivf_pq and ivf_flat files

---
 cpp/include/raft/neighbors/ivf_flat-inl.cuh     | 17 +++++++----------
 cpp/include/raft/neighbors/ivf_pq-ext.cuh       | 12 ++++++------
 cpp/include/raft/neighbors/ivf_pq-inl.cuh       | 10 +++++-----
 cpp/src/neighbors/ivfpq_extend_float_int64_t.cu |  4 ++--
 .../neighbors/ivfpq_extend_int8_t_int64_t.cu    |  4 ++--
 .../neighbors/ivfpq_extend_uint8_t_int64_t.cu   |  4 ++--
 cpp/test/neighbors/ann_ivf_pq.cuh               |  2 +-
 7 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/cpp/include/raft/neighbors/ivf_flat-inl.cuh b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
index 66c6a3cd37..365e483cde 100644
--- a/cpp/include/raft/neighbors/ivf_flat-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
@@ -94,7 +94,7 @@ auto build(raft::device_resources const& handle,
  *   // use default search parameters
  *   ivf_flat::search_params search_params;
  *   // search K nearest neighbours for each of the N queries
- *   ivf_flat::search(handle, index, queries, out_inds, out_dists, search_params, k);
+ *   ivf_flat::search(handle, search_params, index, queries, out_inds, out_dists);
  * @endcode
  *
  * @tparam T data element type
@@ -138,13 +138,11 @@ auto build(raft::device_resources const& handle,
  *   // use default search parameters
  *   ivf_flat::search_params search_params;
  *   // search K nearest neighbours for each of the N queries
- *   ivf_flat::search(handle, index, queries, out_inds, out_dists, search_params, k);
+ *   ivf_flat::search(handle, search_params, index, queries, out_inds, out_dists);
  * @endcode
  *
  * @tparam T data element type
  * @tparam IdxT type of the indices in the source dataset
- * @tparam int_t precision / type of integral arguments
- * @tparam matrix_IdxT matrix indexing type
  *
  * @param[in] handle
  * @param[in] params configure the index building
@@ -313,7 +311,7 @@ void extend(raft::device_resources const& handle,
  *   index_params.add_data_on_build = false;      // don't populate index on build
  *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
  *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_flat::build(handle, dataset, index_params, dataset);
+ *   auto index_empty = ivf_flat::build(handle, index_params, dataset);
  *   // fill the index with the data
  *   std::optional<raft::device_vector_view<const IdxT, IdxT>> no_op = std::nullopt;
  *   ivf_flat::extend(handle, dataset, no_opt, &index_empty);
@@ -324,7 +322,7 @@ void extend(raft::device_resources const& handle,
  *
  * @param[in] handle
  * @param[in] new_vectors raft::device_matrix_view to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices optional raft::device_matrix_view to a vector of indices [n_rows].
+ * @param[in] new_indices optional raft::device_vector_view to a vector of indices [n_rows].
  *    If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt`
  *    here to imply a continuous range `[0...n_rows)`.
  * @param[inout] index pointer to index, to be overwritten in-place
@@ -421,15 +419,14 @@ void search(raft::device_resources const& handle,
  *   ivf_flat::search_params search_params;
  *   // Use the same allocator across multiple searches to reduce the number of
  *   // cuda memory allocations
- *   ivf_flat::search(handle, index, queries1, out_inds1, out_dists1, search_params, K);
- *   ivf_flat::search(handle, index, queries2, out_inds2, out_dists2, search_params, K);
- *   ivf_flat::search(handle, index, queries3, out_inds3, out_dists3, search_params, K);
+ *   ivf_flat::search(handle, search_params, index, queries1, out_inds1, out_dists1);
+ *   ivf_flat::search(handle, search_params, index, queries2, out_inds2, out_dists2);
+ *   ivf_flat::search(handle, search_params, index, queries3, out_inds3, out_dists3);
  *   ...
  * @endcode
  *
  * @tparam T data element type
  * @tparam IdxT type of the indices
- * @tparam int_t precision / type of integral arguments
  *
  * @param[in] handle
  * @param[in] params configure the search
diff --git a/cpp/include/raft/neighbors/ivf_pq-ext.cuh b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
index 2ad32080c6..2e6c8ee858 100644
--- a/cpp/include/raft/neighbors/ivf_pq-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
@@ -64,7 +64,7 @@ index<IdxT> build(raft::device_resources const& handle,
  *
  * @param[in] handle
  * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()]
- * @param[in] new_indices a device matrix view to a vector of indices [n_rows].
+ * @param[in] new_indices a device vector view to a vector of indices [n_rows].
  *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
  *    here to imply a continuous range `[0...n_rows)`.
  * @param[inout] idx
@@ -72,7 +72,7 @@ index<IdxT> build(raft::device_resources const& handle,
 template <typename T, typename IdxT>
 index<IdxT> extend(raft::device_resources const& handle,
                    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
-                   std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,
+                   std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,
                    const index<IdxT>& idx) RAFT_EXPLICIT;
 
 /**
@@ -83,7 +83,7 @@ index<IdxT> extend(raft::device_resources const& handle,
  *
  * @param[in] handle
  * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()]
- * @param[in] new_indices a device matrix view to a vector of indices [n_rows].
+ * @param[in] new_indices a device vector view to a vector of indices [n_rows].
  *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
  *    here to imply a continuous range `[0...n_rows)`.
  * @param[inout] idx
@@ -91,7 +91,7 @@ index<IdxT> extend(raft::device_resources const& handle,
 template <typename T, typename IdxT>
 void extend(raft::device_resources const& handle,
             raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
-            std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,
+            std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,
             index<IdxT>* idx) RAFT_EXPLICIT;
 
 /**
@@ -309,13 +309,13 @@ instantiate_raft_neighbors_ivf_pq_build(uint8_t, int64_t);
   extern template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::extend<T, IdxT>( \
     raft::device_resources const& handle,                                                        \
     raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                              \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,            \
+    std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,            \
     const raft::neighbors::ivf_pq::index<IdxT>& idx);                                            \
                                                                                                  \
   extern template void raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
     raft::device_resources const& handle,                                                        \
     raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                              \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,            \
+    std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,            \
     raft::neighbors::ivf_pq::index<IdxT>* idx);                                                  \
                                                                                                  \
   extern template auto raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
index 4a12ca72a4..2fd21ef0ee 100644
--- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
@@ -69,7 +69,7 @@ index<IdxT> build(raft::device_resources const& handle,
  *
  * @param[in] handle
  * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()]
- * @param[in] new_indices a device matrix view to a vector of indices [n_rows].
+ * @param[in] new_indices a device vector view to a vector of indices [n_rows].
  *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
  *    here to imply a continuous range `[0...n_rows)`.
  * @param[inout] idx
@@ -77,7 +77,7 @@ index<IdxT> build(raft::device_resources const& handle,
 template <typename T, typename IdxT>
 index<IdxT> extend(raft::device_resources const& handle,
                    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
-                   std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,
+                   std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,
                    const index<IdxT>& idx)
 {
   ASSERT(new_vectors.extent(1) == idx.dim(),
@@ -104,7 +104,7 @@ index<IdxT> extend(raft::device_resources const& handle,
  *
  * @param[in] handle
  * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()]
- * @param[in] new_indices a device matrix view to a vector of indices [n_rows].
+ * @param[in] new_indices a device vector view to a vector of indices [n_rows].
  *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
  *    here to imply a continuous range `[0...n_rows)`.
  * @param[inout] idx
@@ -112,7 +112,7 @@ index<IdxT> extend(raft::device_resources const& handle,
 template <typename T, typename IdxT>
 void extend(raft::device_resources const& handle,
             raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
-            std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,
+            std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,
             index<IdxT>* idx)
 {
   ASSERT(new_vectors.extent(1) == idx->dim(),
@@ -234,7 +234,7 @@ auto build(raft::device_resources const& handle,
  * @brief Build a new index containing the data of the original plus new extra vectors.
  *
  * Implementation note:
- *    The new data is clustered according to existing kmeans clusters, then the cluster
+ *    The new data is clustered according to existing kmeans clusters; the cluster
  *    centers are unchanged.
  *
  * Usage example:
diff --git a/cpp/src/neighbors/ivfpq_extend_float_int64_t.cu b/cpp/src/neighbors/ivfpq_extend_float_int64_t.cu
index 247fe7803f..3e728be38d 100644
--- a/cpp/src/neighbors/ivfpq_extend_float_int64_t.cu
+++ b/cpp/src/neighbors/ivfpq_extend_float_int64_t.cu
@@ -21,13 +21,13 @@
   template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::extend<T, IdxT>( \
     raft::device_resources const& handle,                                                 \
     raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                       \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,     \
+    std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,     \
     const raft::neighbors::ivf_pq::index<IdxT>& idx);                                     \
                                                                                           \
   template void raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
     raft::device_resources const& handle,                                                 \
     raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                       \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,     \
+    std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,     \
     raft::neighbors::ivf_pq::index<IdxT>* idx);                                           \
                                                                                           \
   template auto raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
diff --git a/cpp/src/neighbors/ivfpq_extend_int8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_extend_int8_t_int64_t.cu
index 2961dd0353..7853e53f63 100644
--- a/cpp/src/neighbors/ivfpq_extend_int8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivfpq_extend_int8_t_int64_t.cu
@@ -21,13 +21,13 @@
   template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::extend<T, IdxT>( \
     raft::device_resources const& handle,                                                 \
     raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                       \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,     \
+    std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,     \
     const raft::neighbors::ivf_pq::index<IdxT>& idx);                                     \
                                                                                           \
   template void raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
     raft::device_resources const& handle,                                                 \
     raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                       \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,     \
+    std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,     \
     raft::neighbors::ivf_pq::index<IdxT>* idx);                                           \
                                                                                           \
   template auto raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
diff --git a/cpp/src/neighbors/ivfpq_extend_uint8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_extend_uint8_t_int64_t.cu
index 9827486fcf..599a88fc67 100644
--- a/cpp/src/neighbors/ivfpq_extend_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivfpq_extend_uint8_t_int64_t.cu
@@ -21,13 +21,13 @@
   template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::extend<T, IdxT>( \
     raft::device_resources const& handle,                                                 \
     raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                       \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,     \
+    std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,     \
     const raft::neighbors::ivf_pq::index<IdxT>& idx);                                     \
                                                                                           \
   template void raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
     raft::device_resources const& handle,                                                 \
     raft::device_matrix_view<const T, IdxT, row_major> new_vectors,                       \
-    std::optional<raft::device_matrix_view<const IdxT, IdxT, row_major>> new_indices,     \
+    std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,     \
     raft::neighbors::ivf_pq::index<IdxT>* idx);                                           \
                                                                                           \
   template auto raft::neighbors::ivf_pq::extend<T, IdxT>(                                 \
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 1cc9760fb5..90c66ace06 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -26,8 +26,8 @@
 #include <raft/linalg/map_reduce.cuh>
 #include <raft/matrix/gather.cuh>
 #include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/ivf_pq_serialize.cuh>
 #include <raft/neighbors/ivf_pq_helpers.cuh>
+#include <raft/neighbors/ivf_pq_serialize.cuh>
 #include <raft/random/rng.cuh>
 
 #include <rmm/cuda_stream_view.hpp>

From 53f23fb3b7676ae3134f7908eaaf729416f0268b Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 18 Apr 2023 14:52:40 +0200
Subject: [PATCH 79/89] Fix style

---
 cpp/include/raft/distance/distance-ext.cuh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh
index 31d4dc28a1..73d9e7c518 100644
--- a/cpp/include/raft/distance/distance-ext.cuh
+++ b/cpp/include/raft/distance/distance-ext.cuh
@@ -15,12 +15,12 @@
  */
 #pragma once
 
-#include <raft/core/device_mdspan.hpp>                  // raft::device_matrix_view
-#include <raft/core/operators.hpp>                      // raft::identity_op
-#include <raft/core/resources.hpp>                      // raft::resources
-#include <raft/distance/distance_types.hpp>             // raft::distance::DistanceType
-#include <raft/util/raft_explicit.hpp>                  // RAFT_EXPLICIT
-#include <rmm/device_uvector.hpp>                       // rmm::device_uvector
+#include <raft/core/device_mdspan.hpp>       // raft::device_matrix_view
+#include <raft/core/operators.hpp>           // raft::identity_op
+#include <raft/core/resources.hpp>           // raft::resources
+#include <raft/distance/distance_types.hpp>  // raft::distance::DistanceType
+#include <raft/util/raft_explicit.hpp>       // RAFT_EXPLICIT
+#include <rmm/device_uvector.hpp>            // rmm::device_uvector
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE
 

From c684a154a8e4534603a9022a870840a8e4b7b2ee Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Tue, 18 Apr 2023 15:20:23 +0200
Subject: [PATCH 80/89] distance: Add back rbf_fin_op instances

Compiling test/distance/gram.cu would fail otherwise, as it did not
explicitly instantiate the raft::distance::distance.
---
 cpp/CMakeLists.txt                            |  1 +
 .../detail/kernels/kernel_matrices.cuh        |  5 +-
 .../distance/detail/kernels/rbf_fin_op.cuh    | 42 ++++++++++++
 .../detail/pairwise_matrix/dispatch-ext.cuh   | 20 ++++++
 cpp/include/raft/distance/distance-ext.cuh    | 28 ++++++--
 .../pairwise_matrix/dispatch_00_generate.py   | 21 ++++++
 .../detail/pairwise_matrix/dispatch_rbf.cu    | 64 +++++++++++++++++++
 cpp/src/distance/distance.cu                  | 16 +++++
 8 files changed, 189 insertions(+), 8 deletions(-)
 create mode 100644 cpp/include/raft/distance/detail/kernels/rbf_fin_op.cuh
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7f64b92306..c6e61ff92a 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -290,6 +290,7 @@ if(RAFT_COMPILE_LIBRARY)
     src/distance/detail/pairwise_matrix/dispatch_lp_unexpanded_float_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_russel_rao_double_double_double_int.cu
     src/distance/detail/pairwise_matrix/dispatch_russel_rao_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_rbf.cu
     src/distance/distance.cu
     src/distance/fused_l2_nn.cu
     src/linalg/detail/coalesced_reduction.cu
diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh
index d1465efdb0..1b111e77f1 100644
--- a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh
+++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh
@@ -17,10 +17,11 @@
 #pragma once
 
 #include "gram_matrix.cuh"
-#include <raft/util/cuda_utils.cuh>
 
+#include <raft/distance/detail/kernels/rbf_fin_op.cuh>
 #include <raft/distance/distance.cuh>
 #include <raft/linalg/gemm.cuh>
+#include <raft/util/cuda_utils.cuh>
 
 namespace raft::distance::kernels::detail {
 
@@ -353,7 +354,7 @@ class RBFKernel : public GramMatrixBase<math_t> {
     math_t gain   = this->gain;
     using index_t = int64_t;
 
-    auto fin_op = [gain] __device__(math_t d_val, index_t idx) { return exp(-gain * d_val); };
+    rbf_fin_op fin_op{gain};
     raft::distance::distance<raft::distance::DistanceType::L2Unexpanded,
                              math_t,
                              math_t,
diff --git a/cpp/include/raft/distance/detail/kernels/rbf_fin_op.cuh b/cpp/include/raft/distance/detail/kernels/rbf_fin_op.cuh
new file mode 100644
index 0000000000..92118d0b3d
--- /dev/null
+++ b/cpp/include/raft/distance/detail/kernels/rbf_fin_op.cuh
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/math.hpp>                 // raft::exp
+#include <raft/util/cuda_dev_essentials.cuh>  // HD
+
+namespace raft::distance::kernels::detail {
+
+/** @brief Final op for Gram matrix with RBF kernel.
+ *
+ * Calculates output = e^(-gain * in)
+ *
+ */
+template <typename OutT>
+struct rbf_fin_op {
+  OutT gain;
+
+  explicit HD rbf_fin_op(OutT gain_) noexcept : gain(gain_) {}
+
+  template <typename... Args>
+  HDI OutT operator()(OutT d_val, Args... unused_args)
+  {
+    return raft::exp(-gain * d_val);
+  }
+};  // struct rbf_fin_op
+
+}  // namespace raft::distance::kernels::detail
diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
index ff95bb56cd..bf070a599f 100644
--- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
@@ -18,6 +18,7 @@
 #include <raft/core/operators.hpp>                          // raft::identity_op
 #include <raft/distance/detail/distance_ops/all_ops.cuh>    // ops::*
 #include <raft/distance/detail/distance_ops/cutlass.cuh>    // ops::has_cutlass_op
+#include <raft/distance/detail/kernels/rbf_fin_op.cuh>      // rbf_fin_op
 #include <raft/distance/detail/pairwise_matrix/params.cuh>  // pairwise_matrix_params
 #include <raft/util/raft_explicit.hpp>                      // RAFT_EXPLICIT
 
@@ -73,6 +74,25 @@ void pairwise_matrix_dispatch(OpT distance_op,
  *
  * After adding an instance here, make sure to also add the instance there.
  */
+
+// The following two instances are used in the RBF kernel object. Note the use of int64_t for the
+// index type.
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l2_unexp_distance_op,
+  float,
+  float,
+  float,
+  raft::distance::kernels::detail::rbf_fin_op<float>,
+  int64_t);
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l2_unexp_distance_op,
+  double,
+  double,
+  double,
+  raft::distance::kernels::detail::rbf_fin_op<double>,
+  int64_t);
+
+// Rest of instances
 instantiate_raft_distance_detail_pairwise_matrix_dispatch(
   raft::distance::detail::ops::canberra_distance_op, float, float, float, raft::identity_op, int);
 instantiate_raft_distance_detail_pairwise_matrix_dispatch(
diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh
index 73d9e7c518..3a2b182ef9 100644
--- a/cpp/include/raft/distance/distance-ext.cuh
+++ b/cpp/include/raft/distance/distance-ext.cuh
@@ -15,12 +15,13 @@
  */
 #pragma once
 
-#include <raft/core/device_mdspan.hpp>       // raft::device_matrix_view
-#include <raft/core/operators.hpp>           // raft::identity_op
-#include <raft/core/resources.hpp>           // raft::resources
-#include <raft/distance/distance_types.hpp>  // raft::distance::DistanceType
-#include <raft/util/raft_explicit.hpp>       // RAFT_EXPLICIT
-#include <rmm/device_uvector.hpp>            // rmm::device_uvector
+#include <raft/core/device_mdspan.hpp>                  // raft::device_matrix_view
+#include <raft/core/operators.hpp>                      // raft::identity_op
+#include <raft/core/resources.hpp>                      // raft::resources
+#include <raft/distance/detail/kernels/rbf_fin_op.cuh>  // rbf_fin_op
+#include <raft/distance/distance_types.hpp>             // raft::distance::DistanceType
+#include <raft/util/raft_explicit.hpp>                  // RAFT_EXPLICIT
+#include <rmm/device_uvector.hpp>                       // rmm::device_uvector
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE
 
@@ -358,6 +359,21 @@ void pairwise_distance(raft::resources const& handle,
     bool isRowMajor,                                                                       \
     DataT metric_arg)
 
+// The following two instances are used in test/distance/gram.cu. Note the use
+// of int64_t for the index type.
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Unexpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::distance::kernels::detail::rbf_fin_op<float>,
+                                   int64_t);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Unexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::distance::kernels::detail::rbf_fin_op<double>,
+                                   int64_t);
+
 instantiate_raft_distance_distance(
   raft::distance::DistanceType::Canberra, float, float, float, raft::identity_op, int);
 instantiate_raft_distance_distance(
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
index 4537397aac..97fe120458 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
@@ -171,3 +171,24 @@ def arch_headers(archs):
             f.write(f"\ninstantiate_raft_distance_detail_pairwise_matrix_dispatch({OpT}, {DataT}, {AccT}, {OutT}, {FinOpT}, {IdxT});\n")
             f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
         print(f"src/distance/detail/pairwise_matrix/{path}")
+
+# Dispatch kernels with the RBF fin op.
+with open("dispatch_rbf.cu", "w") as f:
+        OpT="raft::distance::detail::ops::l2_unexp_distance_op"
+        archs = [60]
+
+        f.write(header)
+        f.write("#include <raft/distance/detail/kernels/rbf_fin_op.cuh> // rbf_fin_op\n")
+        f.write(arch_headers(archs))
+        f.write(macro)
+
+        for dt in data_type_instances:
+            DataT, AccT, OutT, IdxT = (dt[k] for k in ["DataT", "AccT", "OutT", "IdxT"]);
+            IdxT = "int64_t"    # overwrite IdxT
+
+            FinOpT = f"raft::distance::kernels::detail::rbf_fin_op<{DataT}>"
+            f.write(f"\ninstantiate_raft_distance_detail_pairwise_matrix_dispatch({OpT}, {DataT}, {AccT}, {OutT}, {FinOpT}, {IdxT});\n")
+
+        f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
+
+print("src/distance/detail/pairwise_matrix/dispatch_rbf.cu")
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu
new file mode 100644
index 0000000000..15855cea0a
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
+#include <raft/core/operators.hpp>                                // raft::identity_op
+#include <raft/distance/detail/distance_ops/all_ops.cuh>          // ops::*
+#include <raft/distance/detail/kernels/rbf_fin_op.cuh>            // rbf_fin_op
+#include <raft/distance/detail/pairwise_matrix/dispatch-inl.cuh>  // dispatch
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm60.cuh>
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void raft::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const DataT* x_norm,                                                             \
+      const DataT* y_norm,                                                             \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l2_unexp_distance_op,
+  float,
+  float,
+  float,
+  raft::distance::kernels::detail::rbf_fin_op<float>,
+  int64_t);
+
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  raft::distance::detail::ops::l2_unexp_distance_op,
+  double,
+  double,
+  double,
+  raft::distance::kernels::detail::rbf_fin_op<double>,
+  int64_t);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu
index 91bd506724..8c94608311 100644
--- a/cpp/src/distance/distance.cu
+++ b/cpp/src/distance/distance.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <raft/distance/detail/kernels/rbf_fin_op.cuh>  // rbf_fin_op
 #include <raft/distance/distance-inl.cuh>
 
 /*
@@ -40,6 +41,21 @@
     bool isRowMajor,                                                                 \
     DataT metric_arg)
 
+// The following two instances are used in test/distance/gram.cu. Note the use
+// of int64_t for the index type.
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Unexpanded,
+                                   float,
+                                   float,
+                                   float,
+                                   raft::distance::kernels::detail::rbf_fin_op<float>,
+                                   int64_t);
+instantiate_raft_distance_distance(raft::distance::DistanceType::L2Unexpanded,
+                                   double,
+                                   double,
+                                   double,
+                                   raft::distance::kernels::detail::rbf_fin_op<double>,
+                                   int64_t);
+
 instantiate_raft_distance_distance(
   raft::distance::DistanceType::Canberra, float, float, float, raft::identity_op, int);
 instantiate_raft_distance_distance(

From 6a6b1e582562fcd2302f4327555a54832c9bfac1 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Wed, 19 Apr 2023 14:15:15 +0200
Subject: [PATCH 81/89] Remove unused code

---
 cpp/include/raft/distance/fused_l2_nn-inl.cuh | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/cpp/include/raft/distance/fused_l2_nn-inl.cuh b/cpp/include/raft/distance/fused_l2_nn-inl.cuh
index 05a994b3c7..5cf382cdd9 100644
--- a/cpp/include/raft/distance/fused_l2_nn-inl.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn-inl.cuh
@@ -202,15 +202,9 @@ void fusedL2NNMinReduce(OutT* min,
                         bool initOutBuffer,
                         cudaStream_t stream)
 {
-  // detail::MinAndDistanceReduceOpImpl<IdxT, DataT> redOp;
-  // detail::KVPMinReduceImpl<IdxT, DataT> pairRedOp;
-
   detail::MinAndDistanceReduceOpImpl<IdxT, DataT> redOp;
   detail::KVPMinReduceImpl<IdxT, DataT> pairRedOp;
 
-  // MinAndDistanceReduceOp<IdxT, DataT> redOp;
-  // KVPMinReduce<IdxT, DataT> pairRedOp;
-
   fusedL2NN<DataT, OutT, IdxT>(
     min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
 }

From 486c2e92946683158525b8be2fb99c1e3cf6acf0 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Wed, 19 Apr 2023 14:23:33 +0200
Subject: [PATCH 82/89] Remove spurious macros from brute_force instances

---
 cpp/src/neighbors/brute_force_00_generate.py  | 23 +++++++++++--------
 .../brute_force_fused_l2_knn_float_int64_t.cu | 15 ------------
 .../brute_force_knn_int64_t_float_int64_t.cu  | 11 ---------
 .../brute_force_knn_int64_t_float_uint32_t.cu | 11 ---------
 .../brute_force_knn_int_float_int.cu          | 11 ---------
 ...brute_force_knn_uint32_t_float_uint32_t.cu | 11 ---------
 6 files changed, 13 insertions(+), 69 deletions(-)

diff --git a/cpp/src/neighbors/brute_force_00_generate.py b/cpp/src/neighbors/brute_force_00_generate.py
index 53a10b0a08..251dd53b1c 100644
--- a/cpp/src/neighbors/brute_force_00_generate.py
+++ b/cpp/src/neighbors/brute_force_00_generate.py
@@ -41,6 +41,9 @@
 #include <cstdint>
 #include <raft/neighbors/brute_force-inl.cuh>
 
+"""
+
+knn_macro = """
 #define instantiate_raft_neighbors_brute_force_knn(idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op) \\
     template void raft::neighbors::brute_force::knn<idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op>( \\
         raft::device_resources const& handle,                           \\
@@ -53,7 +56,9 @@
         std::optional<idx_t> global_id_offset,                          \\
         epilogue_op distance_epilogue);
 
+"""
 
+fused_l2_knn_macro = """
 #define instantiate_raft_neighbors_brute_force_fused_l2_knn(value_t, idx_t, idx_layout, query_layout) \\
     template void raft::neighbors::brute_force::fused_l2_knn(    \\
         raft::device_resources const& handle,                           \\
@@ -65,12 +70,6 @@
 
 """
 
-trailer = """
-
-#undef instantiate_raft_neighbors_brute_force_knn
-#undef instantiate_raft_neighbors_brute_force_fused_l2_knn
-"""
-
 knn_types = dict(
     int64_t_float_uint32_t=("int64_t","float","uint32_t"),
     int64_t_float_int64_t=("int64_t","float","int64_t"),
@@ -87,8 +86,10 @@
     path = f"brute_force_knn_{type_path}.cu"
     with open(path, "w") as f:
         f.write(header)
-        f.write(f"instantiate_raft_neighbors_brute_force_knn({idx_t},{value_t},{matrix_idx},raft::row_major,raft::row_major,raft::identity_op);\n")
-        f.write(trailer)
+        f.write(knn_macro)
+        f.write(f"instantiate_raft_neighbors_brute_force_knn({idx_t},{value_t},{matrix_idx},raft::row_major,raft::row_major,raft::identity_op);\n\n")
+        f.write("#undef instantiate_raft_neighbors_brute_force_knn\n")
+
     # For pasting into CMakeLists.txt
     print(f"src/neighbors/{path}")
 
@@ -97,7 +98,9 @@
     path = f"brute_force_fused_l2_knn_{type_path}.cu"
     with open(path, "w") as f:
         f.write(header)
-        f.write(f"instantiate_raft_neighbors_brute_force_fused_l2_knn({value_t},{idx_t},raft::row_major,raft::row_major);\n")
-        f.write(trailer)
+        f.write(fused_l2_knn_macro)
+        f.write(f"instantiate_raft_neighbors_brute_force_fused_l2_knn({value_t},{idx_t},raft::row_major,raft::row_major);\n\n")
+        f.write("#undef instantiate_raft_neighbors_brute_force_fused_l2_knn\n")
+
     # For pasting into CMakeLists.txt
     print(f"src/neighbors/{path}")
diff --git a/cpp/src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu b/cpp/src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu
index 4b7eeb034c..4e1805f9a8 100644
--- a/cpp/src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu
+++ b/cpp/src/neighbors/brute_force_fused_l2_knn_float_int64_t.cu
@@ -27,20 +27,6 @@
 #include <cstdint>
 #include <raft/neighbors/brute_force-inl.cuh>
 
-#define instantiate_raft_neighbors_brute_force_knn(                                         \
-  idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op)                     \
-  template void raft::neighbors::brute_force::                                              \
-    knn<idx_t, value_t, matrix_idx, index_layout, search_layout, epilogue_op>(              \
-      raft::device_resources const& handle,                                                 \
-      std::vector<raft::device_matrix_view<const value_t, matrix_idx, index_layout>> index, \
-      raft::device_matrix_view<const value_t, matrix_idx, search_layout> search,            \
-      raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,                       \
-      raft::device_matrix_view<value_t, matrix_idx, row_major> distances,                   \
-      raft::distance::DistanceType metric,                                                  \
-      std::optional<float> metric_arg,                                                      \
-      std::optional<idx_t> global_id_offset,                                                \
-      epilogue_op distance_epilogue);
-
 #define instantiate_raft_neighbors_brute_force_fused_l2_knn(            \
   value_t, idx_t, idx_layout, query_layout)                             \
   template void raft::neighbors::brute_force::fused_l2_knn(             \
@@ -56,5 +42,4 @@ instantiate_raft_neighbors_brute_force_fused_l2_knn(float,
                                                     raft::row_major,
                                                     raft::row_major);
 
-#undef instantiate_raft_neighbors_brute_force_knn
 #undef instantiate_raft_neighbors_brute_force_fused_l2_knn
diff --git a/cpp/src/neighbors/brute_force_knn_int64_t_float_int64_t.cu b/cpp/src/neighbors/brute_force_knn_int64_t_float_int64_t.cu
index cb2414d164..a668b076d6 100644
--- a/cpp/src/neighbors/brute_force_knn_int64_t_float_int64_t.cu
+++ b/cpp/src/neighbors/brute_force_knn_int64_t_float_int64_t.cu
@@ -41,18 +41,7 @@
       std::optional<idx_t> global_id_offset,                                                \
       epilogue_op distance_epilogue);
 
-#define instantiate_raft_neighbors_brute_force_fused_l2_knn(            \
-  value_t, idx_t, idx_layout, query_layout)                             \
-  template void raft::neighbors::brute_force::fused_l2_knn(             \
-    raft::device_resources const& handle,                               \
-    raft::device_matrix_view<const value_t, idx_t, idx_layout> index,   \
-    raft::device_matrix_view<const value_t, idx_t, query_layout> query, \
-    raft::device_matrix_view<idx_t, idx_t, row_major> out_inds,         \
-    raft::device_matrix_view<value_t, idx_t, row_major> out_dists,      \
-    raft::distance::DistanceType metric);
-
 instantiate_raft_neighbors_brute_force_knn(
   int64_t, float, int64_t, raft::row_major, raft::row_major, raft::identity_op);
 
 #undef instantiate_raft_neighbors_brute_force_knn
-#undef instantiate_raft_neighbors_brute_force_fused_l2_knn
diff --git a/cpp/src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu b/cpp/src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
index e4b9c608f1..21cac5034a 100644
--- a/cpp/src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
+++ b/cpp/src/neighbors/brute_force_knn_int64_t_float_uint32_t.cu
@@ -41,18 +41,7 @@
       std::optional<idx_t> global_id_offset,                                                \
       epilogue_op distance_epilogue);
 
-#define instantiate_raft_neighbors_brute_force_fused_l2_knn(            \
-  value_t, idx_t, idx_layout, query_layout)                             \
-  template void raft::neighbors::brute_force::fused_l2_knn(             \
-    raft::device_resources const& handle,                               \
-    raft::device_matrix_view<const value_t, idx_t, idx_layout> index,   \
-    raft::device_matrix_view<const value_t, idx_t, query_layout> query, \
-    raft::device_matrix_view<idx_t, idx_t, row_major> out_inds,         \
-    raft::device_matrix_view<value_t, idx_t, row_major> out_dists,      \
-    raft::distance::DistanceType metric);
-
 instantiate_raft_neighbors_brute_force_knn(
   int64_t, float, uint32_t, raft::row_major, raft::row_major, raft::identity_op);
 
 #undef instantiate_raft_neighbors_brute_force_knn
-#undef instantiate_raft_neighbors_brute_force_fused_l2_knn
diff --git a/cpp/src/neighbors/brute_force_knn_int_float_int.cu b/cpp/src/neighbors/brute_force_knn_int_float_int.cu
index c9df0b3bbd..b76fe09c2a 100644
--- a/cpp/src/neighbors/brute_force_knn_int_float_int.cu
+++ b/cpp/src/neighbors/brute_force_knn_int_float_int.cu
@@ -41,18 +41,7 @@
       std::optional<idx_t> global_id_offset,                                                \
       epilogue_op distance_epilogue);
 
-#define instantiate_raft_neighbors_brute_force_fused_l2_knn(            \
-  value_t, idx_t, idx_layout, query_layout)                             \
-  template void raft::neighbors::brute_force::fused_l2_knn(             \
-    raft::device_resources const& handle,                               \
-    raft::device_matrix_view<const value_t, idx_t, idx_layout> index,   \
-    raft::device_matrix_view<const value_t, idx_t, query_layout> query, \
-    raft::device_matrix_view<idx_t, idx_t, row_major> out_inds,         \
-    raft::device_matrix_view<value_t, idx_t, row_major> out_dists,      \
-    raft::distance::DistanceType metric);
-
 instantiate_raft_neighbors_brute_force_knn(
   int, float, int, raft::row_major, raft::row_major, raft::identity_op);
 
 #undef instantiate_raft_neighbors_brute_force_knn
-#undef instantiate_raft_neighbors_brute_force_fused_l2_knn
diff --git a/cpp/src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu b/cpp/src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
index 17076857df..4d3f627182 100644
--- a/cpp/src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
+++ b/cpp/src/neighbors/brute_force_knn_uint32_t_float_uint32_t.cu
@@ -41,18 +41,7 @@
       std::optional<idx_t> global_id_offset,                                                \
       epilogue_op distance_epilogue);
 
-#define instantiate_raft_neighbors_brute_force_fused_l2_knn(            \
-  value_t, idx_t, idx_layout, query_layout)                             \
-  template void raft::neighbors::brute_force::fused_l2_knn(             \
-    raft::device_resources const& handle,                               \
-    raft::device_matrix_view<const value_t, idx_t, idx_layout> index,   \
-    raft::device_matrix_view<const value_t, idx_t, query_layout> query, \
-    raft::device_matrix_view<idx_t, idx_t, row_major> out_inds,         \
-    raft::device_matrix_view<value_t, idx_t, row_major> out_dists,      \
-    raft::distance::DistanceType metric);
-
 instantiate_raft_neighbors_brute_force_knn(
   uint32_t, float, uint32_t, raft::row_major, raft::row_major, raft::identity_op);
 
 #undef instantiate_raft_neighbors_brute_force_knn
-#undef instantiate_raft_neighbors_brute_force_fused_l2_knn

From fe6d335028c325d90db01cf1885a1af86963b0f6 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Wed, 19 Apr 2023 14:28:06 +0200
Subject: [PATCH 83/89] Add documentation for rbf_fin_op

---
 cpp/include/raft/distance/detail/kernels/rbf_fin_op.cuh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cpp/include/raft/distance/detail/kernels/rbf_fin_op.cuh b/cpp/include/raft/distance/detail/kernels/rbf_fin_op.cuh
index 92118d0b3d..cd19675477 100644
--- a/cpp/include/raft/distance/detail/kernels/rbf_fin_op.cuh
+++ b/cpp/include/raft/distance/detail/kernels/rbf_fin_op.cuh
@@ -16,6 +16,15 @@
 
 #pragma once
 
+/*
+ * This file defines rbf_fin_op, which is used in GramMatrixBase.
+ *
+ * This struct has been moved to a separate file, so that it is cheap to include
+ * in distance/distance-ext.cuh, where an instance of raft::distance::distance
+ * with the rbf_fin_op is instantiated.
+ *
+ */
+
 #include <raft/core/math.hpp>                 // raft::exp
 #include <raft/util/cuda_dev_essentials.cuh>  // HD
 

From be030b8e1dbc3eb1cd8ecd4efb589bfb87f02810 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Wed, 19 Apr 2023 15:13:48 +0200
Subject: [PATCH 84/89] Add documentation for split header structure

---
 docs/source/developer_guide.md | 65 ++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index 6f57453e28..1528390bfb 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -260,6 +260,71 @@ Sometimes, we need to temporarily change the log pattern (eg: for reporting deci
 
 4. Before creating a new primitive, check to see if one exists already. If one exists but the API isn't flexible enough to include your use-case, consider first refactoring the existing primitive. If that is not possible without an extreme number of changes, consider how the public API could be made more flexible. If the new primitive is different enough from all existing primitives, consider whether an existing public API could invoke the new primitive as an option or argument. If the new primitive is different enough from what exists already, add a header for the new public API function to the appropriate subdirectory and namespace.
 
+## Header organization of expensive function templates
+
+RAFT is a heavily templated library. Several core functions are expensive to compile and we want to prevent duplicate compilation of this functionality. To limit build time, RAFT provides a precompiled library (libraft.so) where expensive function templates are instantiated for the most commonly used template parameters. To prevent (1) accidental instantiation of these templates and (2) unnecessary dependency on the internals of these templates, we use the following header structure.
+
+Any header file that defines an expensive function template (say `expensive.cuh`) should be split in three parts: `expensive.cuh`, `expensive-inl.cuh`, and `expensive-ext.cuh`. The file `expensive-inl.cuh` ("inl" for "inline") contains the template definitions, i.e., the actual code. The file `expensive.cuh` includes one or both of the other two files, depending on the values of the `RAFT_COMPILED` and `RAFT_EXPLICIT_INSTANTIATE` macros. The file `expensive-ext.cuh` contains `extern template` instantiations. In addition, if `RAFT_EXPLICIT_INSTANTIATE` is set, it contains template definitions to ensure that a compiler error is raised in case of accidental instantiation.
+
+The dispatching by `expensive.cuh` is performed as follows:
+``` c++
+#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+// If implicit instantiation is allowed, include template definitions.
+#include "expensive-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+// Include extern template instantiations when RAFT is compiled.
+#include "expensive-ext.cuh"
+#endif
+```
+
+The file `expensive-inl.cuh` is unchanged:
+``` c++
+namespace raft {
+template <typename T>
+void expensive(T arg) {
+  // .. function body
+}
+} // namespace raft
+```
+
+The file `expensive-ext.cuh` contains the following:
+``` c++
+#include <raft/util/raft_explicit.cuh> // RAFT_EXPLICIT
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE
+namespace raft {
+// (1) define templates to raise an error in case of accidental instantiation 
+template <typename T> void expensive(T arg) RAFT_EXPLICIT;
+} // namespace raft
+#endif //RAFT_EXPLICIT_INSTANTIATE
+
+// (2) Provide extern template instantiations.
+extern template void raft::expensive<int>(int);
+extern template void raft::expensive<float>(float);
+```
+
+This header has two responsibilities: (1) define templates to raise an error in case of accidental instantiation and (2) provide `extern template` instantiations.
+First, if `RAFT_EXPLICIT_INSTANTIATE` is set, `expensive` is defined. This is done for two reasons: (1) to give a definition, because the definition in `expensive-inl.cuh` was skipped and (2) to indicate that the template should be explicitly instantiated by taging it with the `RAFT_EXPLICIT` macro. This macro defines the function body, and it ensures that an informative error message is generated when an implicit instantiation erroneously occurs. Finally, the `extern template` instantiations are listed.
+
+To actually generate the code for the template instances, the file `src/expensive.cu` contains the following. Note that the only difference between the extern template instantiations in `expensive-ext.cuh` and these lines are the removal of the word `extern`:
+
+``` c++
+#include <raft/expensive-inl.cuh>
+
+template void raft::expensive<int>(int);
+template void raft::expensive<float>(float);
+```
+
+**Design considerations**: 
+
+1. In the `-ext.cuh` header, do not include implementation headers. Only include function parameter types and types that are used to instantiate the templates.
+
+2. Keep docstrings in the `-inl.cuh` header, as it is closer to the code. Remove docstrings from template definitions in the `-ext.cuh` header.
+
+This header structure was proposed in [issue #1416](https://github.com/rapidsai/raft/issues/1416), which contains more background on the motivation of this structure and the mechanics of C++ template instantiation. 
+
 ## Testing
 
 It's important for RAFT to maintain a high test coverage of the public APIs in order to minimize the potential for downstream projects to encounter unexpected build or runtime behavior as a result of changes. 

From eb0d3b22877f2a0e3e42b8a993337a7f046c2416 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Wed, 19 Apr 2023 15:23:41 +0200
Subject: [PATCH 85/89] Rename RAFT_EXPLICIT_INSTANTIATE =>
 RAFT_EXPLICIT_INSTANTIATE_ONLY

---
 cpp/CMakeLists.txt                                     |  2 +-
 cpp/bench/prims/CMakeLists.txt                         |  2 +-
 .../distance/detail/pairwise_matrix/dispatch-ext.cuh   |  4 ++--
 .../raft/distance/detail/pairwise_matrix/dispatch.cuh  |  2 +-
 cpp/include/raft/distance/distance-ext.cuh             |  4 ++--
 cpp/include/raft/distance/distance.cuh                 |  2 +-
 cpp/include/raft/distance/fused_l2_nn-ext.cuh          |  4 ++--
 cpp/include/raft/distance/fused_l2_nn.cuh              |  2 +-
 cpp/include/raft/matrix/detail/select_k-ext.cuh        |  4 ++--
 cpp/include/raft/matrix/detail/select_k.cuh            |  2 +-
 cpp/include/raft/neighbors/ball_cover-ext.cuh          |  4 ++--
 cpp/include/raft/neighbors/ball_cover.cuh              |  2 +-
 cpp/include/raft/neighbors/brute_force-ext.cuh         |  4 ++--
 cpp/include/raft/neighbors/brute_force.cuh             |  2 +-
 .../neighbors/detail/ivf_flat_interleaved_scan-ext.cuh |  4 ++--
 .../neighbors/detail/ivf_flat_interleaved_scan.cuh     |  2 +-
 .../raft/neighbors/detail/ivf_flat_search-ext.cuh      |  4 ++--
 cpp/include/raft/neighbors/detail/ivf_flat_search.cuh  |  2 +-
 .../neighbors/detail/ivf_pq_compute_similarity-ext.cuh |  4 ++--
 .../neighbors/detail/ivf_pq_compute_similarity.cuh     |  2 +-
 .../raft/neighbors/detail/selection_faiss-ext.cuh      |  4 ++--
 cpp/include/raft/neighbors/detail/selection_faiss.cuh  |  2 +-
 cpp/include/raft/neighbors/ivf_flat-ext.cuh            |  4 ++--
 cpp/include/raft/neighbors/ivf_flat.cuh                |  2 +-
 cpp/include/raft/neighbors/ivf_pq-ext.cuh              |  4 ++--
 cpp/include/raft/neighbors/ivf_pq.cuh                  |  2 +-
 cpp/include/raft/neighbors/refine-ext.cuh              |  4 ++--
 cpp/include/raft/neighbors/refine.cuh                  |  2 +-
 .../spatial/knn/detail/ball_cover/registers-ext.cuh    |  4 ++--
 .../raft/spatial/knn/detail/ball_cover/registers.cuh   |  2 +-
 .../raft/spatial/knn/detail/fused_l2_knn-ext.cuh       |  4 ++--
 cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh   |  2 +-
 cpp/include/raft/util/raft_explicit.hpp                |  2 +-
 cpp/test/CMakeLists.txt                                |  2 +-
 cpp/test/cluster/linkage.cu                            |  2 +-
 cpp/test/distance/dist_adj_distance_instance.cu        |  2 +-
 cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu   |  2 +-
 cpp/test/neighbors/selection.cu                        |  2 +-
 cpp/test/sparse/neighbors/connect_components.cu        |  2 +-
 docs/source/developer_guide.md                         | 10 +++++-----
 docs/source/using_libraft.md                           |  4 ++--
 41 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c6e61ff92a..955dbf8f49 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -418,7 +418,7 @@ if(RAFT_COMPILE_LIBRARY)
                      "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
   )
   target_compile_definitions(raft_lib PUBLIC "RAFT_COMPILED")
-  target_compile_definitions(raft_lib PRIVATE "RAFT_EXPLICIT_INSTANTIATE")
+  target_compile_definitions(raft_lib PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY")
 
   # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
   target_link_options(raft_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index e222316635..cb83d1aa69 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -54,7 +54,7 @@ function(ConfigureBench)
     ${BENCH_NAME} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAFT_CXX_FLAGS}>"
                           "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
   )
-  target_compile_definitions(${BENCH_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE")
+  target_compile_definitions(${BENCH_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY")
 
   target_include_directories(
     ${BENCH_NAME} PUBLIC "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/bench/prims>"
diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
index bf070a599f..e1dc6f9b37 100644
--- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch-ext.cuh
@@ -22,7 +22,7 @@
 #include <raft/distance/detail/pairwise_matrix/params.cuh>  // pairwise_matrix_params
 #include <raft/util/raft_explicit.hpp>                      // RAFT_EXPLICIT
 
-#ifdef RAFT_EXPLICIT_INSTANTIATE
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft::distance::detail {
 
@@ -47,7 +47,7 @@ void pairwise_matrix_dispatch(OpT distance_op,
 
 };  // namespace raft::distance::detail
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
   OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
index f26c67a8d9..31aebed3d0 100644
--- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "dispatch-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh
index 3a2b182ef9..e002323bd7 100644
--- a/cpp/include/raft/distance/distance-ext.cuh
+++ b/cpp/include/raft/distance/distance-ext.cuh
@@ -23,7 +23,7 @@
 #include <raft/util/raft_explicit.hpp>                  // RAFT_EXPLICIT
 #include <rmm/device_uvector.hpp>                       // rmm::device_uvector
 
-#ifdef RAFT_EXPLICIT_INSTANTIATE
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft {
 namespace distance {
@@ -330,7 +330,7 @@ void pairwise_distance(raft::resources const& handle,
 };  // namespace distance
 };  // namespace raft
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 /*
  * Hierarchy of instantiations:
diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh
index bf38d5e5fe..7d5cc5d486 100644
--- a/cpp/include/raft/distance/distance.cuh
+++ b/cpp/include/raft/distance/distance.cuh
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "distance-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/distance/fused_l2_nn-ext.cuh b/cpp/include/raft/distance/fused_l2_nn-ext.cuh
index 4968fcb602..927ba62252 100644
--- a/cpp/include/raft/distance/fused_l2_nn-ext.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn-ext.cuh
@@ -20,7 +20,7 @@
 #include <raft/core/kvp.hpp>
 #include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
 
-#ifdef RAFT_EXPLICIT_INSTANTIATE
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft {
 namespace distance {
@@ -94,7 +94,7 @@ void fusedL2NNMinReduce(OutT* min,
 }  // namespace distance
 }  // namespace raft
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #define instantiate_raft_distance_fusedL2NNMinReduce(DataT, OutT, IdxT)                          \
   extern template void raft::distance::fusedL2NNMinReduce<DataT, OutT, IdxT>(OutT * min,         \
diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh
index 9501602353..737d3fcb08 100644
--- a/cpp/include/raft/distance/fused_l2_nn.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "fused_l2_nn-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh
index 2f2912219f..a09c84c18a 100644
--- a/cpp/include/raft/matrix/detail/select_k-ext.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh
@@ -22,7 +22,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
-#ifdef RAFT_EXPLICIT_INSTANTIATE
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft::matrix::detail {
 
@@ -76,7 +76,7 @@ void select_k(const T* in_val,
               rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT;
 }  // namespace raft::matrix::detail
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #define instantiate_raft_matrix_detail_select_k(T, IdxT)                            \
   extern template void raft::matrix::detail::select_k(const T* in_val,              \
diff --git a/cpp/include/raft/matrix/detail/select_k.cuh b/cpp/include/raft/matrix/detail/select_k.cuh
index 54dabf77bd..d011f23534 100644
--- a/cpp/include/raft/matrix/detail/select_k.cuh
+++ b/cpp/include/raft/matrix/detail/select_k.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "select_k-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/neighbors/ball_cover-ext.cuh b/cpp/include/raft/neighbors/ball_cover-ext.cuh
index 89ea855d31..add2d024d4 100644
--- a/cpp/include/raft/neighbors/ball_cover-ext.cuh
+++ b/cpp/include/raft/neighbors/ball_cover-ext.cuh
@@ -25,7 +25,7 @@
 
 #include <thrust/transform.h>
 
-#ifdef RAFT_EXPLICIT_INSTANTIATE
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft::neighbors::ball_cover {
 
@@ -282,7 +282,7 @@ void knn_query(raft::device_resources const& handle,
 
 }  // namespace raft::neighbors::ball_cover
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #define instantiate_raft_neighbors_ball_cover(idx_t, value_t, int_t, matrix_idx_t)                 \
   extern template void                                                                             \
diff --git a/cpp/include/raft/neighbors/ball_cover.cuh b/cpp/include/raft/neighbors/ball_cover.cuh
index de01e756a0..82c56b64dd 100644
--- a/cpp/include/raft/neighbors/ball_cover.cuh
+++ b/cpp/include/raft/neighbors/ball_cover.cuh
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "ball_cover-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/neighbors/brute_force-ext.cuh b/cpp/include/raft/neighbors/brute_force-ext.cuh
index d3a70716e0..14ed70bcb3 100644
--- a/cpp/include/raft/neighbors/brute_force-ext.cuh
+++ b/cpp/include/raft/neighbors/brute_force-ext.cuh
@@ -23,7 +23,7 @@
 #include <raft/spatial/knn/detail/fused_l2_knn.cuh>
 #include <raft/util/raft_explicit.hpp>
 
-#ifdef RAFT_EXPLICIT_INSTANTIATE
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft::neighbors::brute_force {
 
@@ -184,7 +184,7 @@ void fused_l2_knn(raft::device_resources const& handle,
 
 }  // namespace raft::neighbors::brute_force
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 // No extern template for raft::neighbors::brute_force::knn_merge_parts
 
diff --git a/cpp/include/raft/neighbors/brute_force.cuh b/cpp/include/raft/neighbors/brute_force.cuh
index f767c840bf..8453a83df4 100644
--- a/cpp/include/raft/neighbors/brute_force.cuh
+++ b/cpp/include/raft/neighbors/brute_force.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "brute_force-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh
index a0eaea0260..a7cf4ebd79 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh
@@ -22,7 +22,7 @@
 #include <raft/util/raft_explicit.hpp>            // RAFT_EXPLICIT
 #include <rmm/cuda_stream_view.hpp>               // rmm:cuda_stream_view
 
-#ifdef RAFT_EXPLICIT_INSTANTIATE
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft::neighbors::ivf_flat::detail {
 
@@ -69,7 +69,7 @@ void ivfflat_interleaved_scan(const raft::neighbors::ivf_flat::index<T, IdxT>& i
 
 }  // namespace raft::neighbors::ivf_flat::detail
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #define instantiate_raft_neighbors_ivf_flat_detail_ivfflat_interleaved_scan(T, AccT, IdxT)         \
   extern template void raft::neighbors::ivf_flat::detail::ivfflat_interleaved_scan<T, AccT, IdxT>( \
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh
index d6a4fed973..63f341dd9a 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "ivf_flat_interleaved_scan-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
index 529412a17c..7e27c50bf7 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
@@ -22,7 +22,7 @@
 #include <raft/util/raft_explicit.hpp>            // RAFT_EXPLICIT
 #include <rmm/cuda_stream_view.hpp>               // rmm:cuda_stream_view
 
-#ifdef RAFT_EXPLICIT_INSTANTIATE
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft::neighbors::ivf_flat::detail {
 
@@ -40,7 +40,7 @@ void search(raft::device_resources const& handle,
 
 }  // namespace raft::neighbors::ivf_flat::detail
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #define instantiate_raft_neighbors_ivf_flat_detail_search(T, IdxT)         \
   extern template void raft::neighbors::ivf_flat::detail::search<T, IdxT>( \
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh
index 1f262e4463..acf9d2c99d 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "ivf_flat_search-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
index 104a31e869..14f5e18013 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
@@ -23,7 +23,7 @@
 #include <raft/util/raft_explicit.hpp>               // RAFT_EXPLICIT
 #include <rmm/cuda_stream_view.hpp>                  // rmm::cuda_stream_view
 
-#ifdef RAFT_EXPLICIT_INSTANTIATE
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft::neighbors::ivf_pq::detail {
 
@@ -140,7 +140,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props,
 
 }  // namespace raft::neighbors::ivf_pq::detail
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #define instantiate_raft_neighbors_ivf_pq_detail_compute_similarity_select(OutT, LutT)         \
   extern template auto raft::neighbors::ivf_pq::detail::compute_similarity_select<OutT, LutT>( \
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity.cuh
index 2d63617798..d987c0d4ed 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "ivf_pq_compute_similarity-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
index 7ff30e3eff..b080294065 100644
--- a/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
@@ -20,7 +20,7 @@
 #include <cstdint>                      // uint32_t
 #include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
 
-#if defined(RAFT_EXPLICIT_INSTANTIATE)
+#if defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 
 namespace raft::neighbors::detail {
 /**
@@ -49,7 +49,7 @@ void select_k(const key_t* inK,
               cudaStream_t stream) RAFT_EXPLICIT;
 };  // namespace raft::neighbors::detail
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #define instantiate_raft_neighbors_detail_select_k(payload_t, key_t)           \
   extern template void raft::neighbors::detail::select_k(const key_t* inK,     \
diff --git a/cpp/include/raft/neighbors/detail/selection_faiss.cuh b/cpp/include/raft/neighbors/detail/selection_faiss.cuh
index d1a2ac1a17..06b4478010 100644
--- a/cpp/include/raft/neighbors/detail/selection_faiss.cuh
+++ b/cpp/include/raft/neighbors/detail/selection_faiss.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "selection_faiss-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/neighbors/ivf_flat-ext.cuh b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
index ad41534510..fe4e411aa4 100644
--- a/cpp/include/raft/neighbors/ivf_flat-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
@@ -27,7 +27,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 
-#ifdef RAFT_EXPLICIT_INSTANTIATE
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft::neighbors::ivf_flat {
 
@@ -411,7 +411,7 @@ void search(raft::device_resources const& handle,
 
 }  // namespace raft::neighbors::ivf_flat
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #define instantiate_raft_neighbors_ivf_flat_build(T, IdxT)        \
   extern template auto raft::neighbors::ivf_flat::build<T, IdxT>( \
diff --git a/cpp/include/raft/neighbors/ivf_flat.cuh b/cpp/include/raft/neighbors/ivf_flat.cuh
index 8e3e2bb813..4906ddab60 100644
--- a/cpp/include/raft/neighbors/ivf_flat.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "ivf_flat-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/neighbors/ivf_pq-ext.cuh b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
index 2e6c8ee858..31cc299996 100644
--- a/cpp/include/raft/neighbors/ivf_pq-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
@@ -25,7 +25,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 
-#ifdef RAFT_EXPLICIT_INSTANTIATE
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft::neighbors::ivf_pq {
 
@@ -283,7 +283,7 @@ void search(raft::device_resources const& handle,
 
 }  // namespace raft::neighbors::ivf_pq
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #define instantiate_raft_neighbors_ivf_pq_build(T, IdxT)                                        \
   extern template raft::neighbors::ivf_pq::index<IdxT> raft::neighbors::ivf_pq::build<T, IdxT>( \
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index 7b3dd05efd..055d159b94 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "ivf_pq-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/neighbors/refine-ext.cuh b/cpp/include/raft/neighbors/refine-ext.cuh
index c210e599ae..65e356eed4 100644
--- a/cpp/include/raft/neighbors/refine-ext.cuh
+++ b/cpp/include/raft/neighbors/refine-ext.cuh
@@ -24,7 +24,7 @@
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 #include <raft/util/raft_explicit.hpp>
 
-#ifdef RAFT_EXPLICIT_INSTANTIATE
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft::neighbors {
 
@@ -101,7 +101,7 @@ void refine(raft::device_resources const& handle,
 /** @} */  // end group ann_refine
 }  // namespace raft::neighbors
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #define instantiate_raft_neighbors_refine(idx_t, data_t, distance_t, matrix_idx)       \
   extern template void raft::neighbors::refine<idx_t, data_t, distance_t, matrix_idx>( \
diff --git a/cpp/include/raft/neighbors/refine.cuh b/cpp/include/raft/neighbors/refine.cuh
index 06cbc8241c..7fe190493f 100644
--- a/cpp/include/raft/neighbors/refine.cuh
+++ b/cpp/include/raft/neighbors/refine.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "refine-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
index 2cbf8fb89e..7d0b409bf6 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
@@ -21,7 +21,7 @@
 #include <cstdint>                      // uint32_t
 #include <raft/util/raft_explicit.hpp>  //RAFT_EXPLICIT
 
-#if defined(RAFT_EXPLICIT_INSTANTIATE)
+#if defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 
 namespace raft::spatial::knn::detail {
 
@@ -63,7 +63,7 @@ void rbc_low_dim_pass_two(raft::device_resources const& handle,
 
 };  // namespace raft::spatial::knn::detail
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #define instantiate_raft_spatial_knn_detail_rbc_low_dim_pass_one(                            \
   Mvalue_idx, Mvalue_t, Mvalue_int, Mdims, Mdist_func)                                       \
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
index b5925680f2..b60cd645b4 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "registers-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
index 07ddf3a166..b12482b19d 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
@@ -19,7 +19,7 @@
 #include <raft/distance/distance_types.hpp>  // DistanceType
 #include <raft/util/raft_explicit.hpp>       // RAFT_EXPLICIT
 
-#if defined(RAFT_EXPLICIT_INSTANTIATE)
+#if defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 
 namespace raft::spatial::knn::detail {
 /**
@@ -54,7 +54,7 @@ void fusedL2Knn(size_t D,
 
 }  // namespace raft::spatial::knn::detail
 
-#endif  // RAFT_EXPLICIT_INSTANTIATE
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #define instantiate_raft_spatial_knn_detail_fusedL2Knn(Mvalue_idx, Mvalue_t, MusePrevTopKs) \
   extern template void                                                                      \
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
index f9b9138168..38dd2f332f 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 #include "fused_l2_knn-inl.cuh"
 #endif
 
diff --git a/cpp/include/raft/util/raft_explicit.hpp b/cpp/include/raft/util/raft_explicit.hpp
index 8b5c97390e..7edb2f0b42 100644
--- a/cpp/include/raft/util/raft_explicit.hpp
+++ b/cpp/include/raft/util/raft_explicit.hpp
@@ -56,7 +56,7 @@
                                                                                                \
       "Simplest temporary solution:\n\n"                                                       \
                                                                                                \
-      "    Add '#undef RAFT_EXPLICIT_INSTANTIATE' at the top of your .cpp/.cu file.\n\n"       \
+      "    Add '#undef RAFT_EXPLICIT_INSTANTIATE_ONLY' at the top of your .cpp/.cu file.\n\n"  \
                                                                                                \
       "Best solution:\n\n"                                                                     \
                                                                                                \
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 175411f983..b236a21e66 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -59,7 +59,7 @@ function(ConfigureTest)
                          "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
   )
 
-  target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE")
+  target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY")
   target_include_directories(${TEST_NAME} PUBLIC "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/test>")
 
   install(
diff --git a/cpp/test/cluster/linkage.cu b/cpp/test/cluster/linkage.cu
index 5b17c9be63..b2b177dde6 100644
--- a/cpp/test/cluster/linkage.cu
+++ b/cpp/test/cluster/linkage.cu
@@ -21,7 +21,7 @@
 //
 // TODO: consider adding this to libraft.so or creating an instance in a
 // separate translation unit for this test.
-#undef RAFT_EXPLICIT_INSTANTIATE
+#undef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #include "../test_utils.cuh"
 
diff --git a/cpp/test/distance/dist_adj_distance_instance.cu b/cpp/test/distance/dist_adj_distance_instance.cu
index 2250701fe9..d4685d8095 100644
--- a/cpp/test/distance/dist_adj_distance_instance.cu
+++ b/cpp/test/distance/dist_adj_distance_instance.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#undef RAFT_EXPLICIT_INSTANTIATE
+#undef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #include "dist_adj_threshold.cuh"
 #include <cstdint>
diff --git a/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu b/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
index 67f790a8ae..3d362a5261 100644
--- a/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
+++ b/cpp/test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
@@ -19,7 +19,7 @@
 //
 // TODO: consider removing this test or consider adding an instantiation to the
 // library.
-#undef RAFT_EXPLICIT_INSTANTIATE
+#undef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #include "../ann_ivf_pq.cuh"
 
diff --git a/cpp/test/neighbors/selection.cu b/cpp/test/neighbors/selection.cu
index 281400c396..1b114f60df 100644
--- a/cpp/test/neighbors/selection.cu
+++ b/cpp/test/neighbors/selection.cu
@@ -17,7 +17,7 @@
 // XXX: we currently disable the EXPLICIT_INSTANTIATION restriction for now because we
 // need kFaissMax, which is not exposed by selection_faiss-ext.cuh.
 // TODO-inl-headers: consider how to re-enable it.
-#undef RAFT_EXPLICIT_INSTANTIATE
+#undef RAFT_EXPLICIT_INSTANTIATE_ONLY
 #include <raft/neighbors/detail/selection_faiss.cuh>
 
 #include <algorithm>
diff --git a/cpp/test/sparse/neighbors/connect_components.cu b/cpp/test/sparse/neighbors/connect_components.cu
index a4fdd35558..e14cd9a180 100644
--- a/cpp/test/sparse/neighbors/connect_components.cu
+++ b/cpp/test/sparse/neighbors/connect_components.cu
@@ -21,7 +21,7 @@
 //
 // TODO: consider adding this to libraft.so or creating an instance in a
 // separate translation unit for this test.
-#undef RAFT_EXPLICIT_INSTANTIATE
+#undef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 #include <gtest/gtest.h>
 
diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index 1528390bfb..8b4470fc16 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -264,11 +264,11 @@ Sometimes, we need to temporarily change the log pattern (eg: for reporting deci
 
 RAFT is a heavily templated library. Several core functions are expensive to compile and we want to prevent duplicate compilation of this functionality. To limit build time, RAFT provides a precompiled library (libraft.so) where expensive function templates are instantiated for the most commonly used template parameters. To prevent (1) accidental instantiation of these templates and (2) unnecessary dependency on the internals of these templates, we use the following header structure.
 
-Any header file that defines an expensive function template (say `expensive.cuh`) should be split in three parts: `expensive.cuh`, `expensive-inl.cuh`, and `expensive-ext.cuh`. The file `expensive-inl.cuh` ("inl" for "inline") contains the template definitions, i.e., the actual code. The file `expensive.cuh` includes one or both of the other two files, depending on the values of the `RAFT_COMPILED` and `RAFT_EXPLICIT_INSTANTIATE` macros. The file `expensive-ext.cuh` contains `extern template` instantiations. In addition, if `RAFT_EXPLICIT_INSTANTIATE` is set, it contains template definitions to ensure that a compiler error is raised in case of accidental instantiation.
+Any header file that defines an expensive function template (say `expensive.cuh`) should be split in three parts: `expensive.cuh`, `expensive-inl.cuh`, and `expensive-ext.cuh`. The file `expensive-inl.cuh` ("inl" for "inline") contains the template definitions, i.e., the actual code. The file `expensive.cuh` includes one or both of the other two files, depending on the values of the `RAFT_COMPILED` and `RAFT_EXPLICIT_INSTANTIATE_ONLY` macros. The file `expensive-ext.cuh` contains `extern template` instantiations. In addition, if `RAFT_EXPLICIT_INSTANTIATE_ONLY` is set, it contains template definitions to ensure that a compiler error is raised in case of accidental instantiation.
 
 The dispatching by `expensive.cuh` is performed as follows:
 ``` c++
-#if !defined(RAFT_EXPLICIT_INSTANTIATE)
+#if !defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 // If implicit instantiation is allowed, include template definitions.
 #include "expensive-inl.cuh"
 #endif
@@ -293,12 +293,12 @@ The file `expensive-ext.cuh` contains the following:
 ``` c++
 #include <raft/util/raft_explicit.cuh> // RAFT_EXPLICIT
 
-#ifdef RAFT_EXPLICIT_INSTANTIATE
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 namespace raft {
 // (1) define templates to raise an error in case of accidental instantiation 
 template <typename T> void expensive(T arg) RAFT_EXPLICIT;
 } // namespace raft
-#endif //RAFT_EXPLICIT_INSTANTIATE
+#endif //RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 // (2) Provide extern template instantiations.
 extern template void raft::expensive<int>(int);
@@ -306,7 +306,7 @@ extern template void raft::expensive<float>(float);
 ```
 
 This header has two responsibilities: (1) define templates to raise an error in case of accidental instantiation and (2) provide `extern template` instantiations.
-First, if `RAFT_EXPLICIT_INSTANTIATE` is set, `expensive` is defined. This is done for two reasons: (1) to give a definition, because the definition in `expensive-inl.cuh` was skipped and (2) to indicate that the template should be explicitly instantiated by taging it with the `RAFT_EXPLICIT` macro. This macro defines the function body, and it ensures that an informative error message is generated when an implicit instantiation erroneously occurs. Finally, the `extern template` instantiations are listed.
+First, if `RAFT_EXPLICIT_INSTANTIATE_ONLY` is set, `expensive` is defined. This is done for two reasons: (1) to give a definition, because the definition in `expensive-inl.cuh` was skipped and (2) to indicate that the template should be explicitly instantiated by tagging it with the `RAFT_EXPLICIT` macro. This macro defines the function body, and it ensures that an informative error message is generated when an implicit instantiation erroneously occurs. Finally, the `extern template` instantiations are listed.
 
 To actually generate the code for the template instances, the file `src/expensive.cu` contains the following. Note that the only difference between the extern template instantiations in `expensive-ext.cuh` and these lines are the removal of the word `extern`:
 
diff --git a/docs/source/using_libraft.md b/docs/source/using_libraft.md
index a50b6ad690..c28fadab46 100644
--- a/docs/source/using_libraft.md
+++ b/docs/source/using_libraft.md
@@ -22,12 +22,12 @@ There are three ways to speed up compile times:
 
 ### How do I verify template instantiations didn't compile into my binary?
 
-To verify that you are not accidentally instantiating templates that have not been pre-compiled in RAFT, set the `RAFT_EXPLICIT_INSTANTIATE` macro. This only works if you are linking with the pre-compiled libraft (i.e., when `RAFT_COMPILED` has been defined). To check if, for instance, `raft::distance::distance` has been precompiled with specific template arguments, you can set `RAFT_EXPLICIT_INSTANTIATE` at the top of the file you are compiling, as in the following example:
+To verify that you are not accidentally instantiating templates that have not been pre-compiled in RAFT, set the `RAFT_EXPLICIT_INSTANTIATE_ONLY` macro. This only works if you are linking with the pre-compiled libraft (i.e., when `RAFT_COMPILED` has been defined). To check if, for instance, `raft::distance::distance` has been precompiled with specific template arguments, you can set `RAFT_EXPLICIT_INSTANTIATE_ONLY` at the top of the file you are compiling, as in the following example:
 
 ```c++
 
 #ifdef RAFT_COMPILED
-#define RAFT_EXPLICIT_INSTANTIATE
+#define RAFT_EXPLICIT_INSTANTIATE_ONLY
 #endif
 
 #include <cstdint>

From 0b9b79684a4b27cff912937d335b7edd4e6c775b Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Wed, 19 Apr 2023 15:26:32 +0200
Subject: [PATCH 86/89] Remove docstrings from -ext headers

Only keep in logger-ext.hpp: I don't know how to document the logger
class otherwise.
---
 cpp/include/raft/distance/distance-ext.cuh    | 187 -----------
 cpp/include/raft/distance/fused_l2_nn-ext.cuh |  49 +--
 .../raft/matrix/detail/select_k-ext.cuh       |  37 ---
 cpp/include/raft/neighbors/ball_cover-ext.cuh | 208 ------------
 .../raft/neighbors/brute_force-ext.cuh        | 121 -------
 .../detail/ivf_flat_interleaved_scan-ext.cuh  |  25 --
 .../neighbors/detail/ivf_flat_search-ext.cuh  |   1 -
 .../detail/ivf_pq_compute_similarity-ext.cuh  |   1 +
 .../neighbors/detail/selection_faiss-ext.cuh  |  15 +-
 cpp/include/raft/neighbors/ivf_flat-ext.cuh   | 314 ------------------
 cpp/include/raft/neighbors/ivf_pq-ext.cuh     | 195 -----------
 cpp/include/raft/neighbors/refine-ext.cuh     |  53 ---
 .../spatial/knn/detail/fused_l2_knn-ext.cuh   |  15 -
 cpp/include/raft/util/memory_pool-ext.hpp     |  32 --
 cpp/include/raft/util/memory_pool-inl.hpp     |  32 ++
 docs/source/developer_guide.md                |   6 +-
 16 files changed, 41 insertions(+), 1250 deletions(-)

diff --git a/cpp/include/raft/distance/distance-ext.cuh b/cpp/include/raft/distance/distance-ext.cuh
index e002323bd7..7171ba605f 100644
--- a/cpp/include/raft/distance/distance-ext.cuh
+++ b/cpp/include/raft/distance/distance-ext.cuh
@@ -28,37 +28,6 @@
 namespace raft {
 namespace distance {
 
-/**
- * @defgroup pairwise_distance pointer-based pairwise distance prims
- * @{
- */
-
-/**
- * @brief Evaluate pairwise distances with the user epilogue lamba allowed
- * @tparam DistanceType which distance to evaluate
- * @tparam DataT input argument type
- * @tparam AccT accumulation type
- * @tparam OutT output type
- * @tparam FinalLambda user-defined epilogue lamba
- * @tparam IdxT Index type
- * @param handle raft handle for managing expensive resources
- * @param x first set of points
- * @param y second set of points
- * @param dist output distance matrix
- * @param m number of points in x
- * @param n number of points in y
- * @param k dimensionality
- * @param workspace temporary workspace needed for computations
- * @param worksize number of bytes of the workspace
- * @param fin_op the final gemm epilogue lambda
- * @param isRowMajor whether the matrices are row-major or col-major
- * @param metric_arg metric argument (used for Minkowski distance)
- *
- * @note fin_op: This is a device lambda which is supposed to operate upon the
- * input which is AccT and returns the output in OutT. It's signature is
- * as follows:  <pre>OutT fin_op(AccT in, int g_idx);</pre>. If one needs
- * any other parameters, feel free to pass them via closure.
- */
 template <raft::distance::DistanceType DistT,
           typename DataT,
           typename AccT,
@@ -78,25 +47,6 @@ void distance(raft::resources const& handle,
               bool isRowMajor  = true,
               DataT metric_arg = 2.0f) RAFT_EXPLICIT;
 
-/**
- * @brief Evaluate pairwise distances for the simple use case
- * @tparam DistanceType which distance to evaluate
- * @tparam DataT input argument type
- * @tparam AccT accumulation type
- * @tparam OutT output type
- * @tparam IdxT Index type
- * @param handle raft handle for managing expensive resources
- * @param x first set of points
- * @param y second set of points
- * @param dist output distance matrix
- * @param m number of points in x
- * @param n number of points in y
- * @param k dimensionality
- * @param workspace temporary workspace needed for computations
- * @param worksize number of bytes of the workspace
- * @param isRowMajor whether the matrices are row-major or col-major
- * @param metric_arg metric argument (used for Minkowski distance)
- */
 template <raft::distance::DistanceType DistT,
           typename DataT,
           typename AccT,
@@ -114,22 +64,6 @@ void distance(raft::resources const& handle,
               bool isRowMajor  = true,
               DataT metric_arg = 2.0f) RAFT_EXPLICIT;
 
-/**
- * @brief Return the exact workspace size to compute the distance
- * @tparam DistanceType which distance to evaluate
- * @tparam DataT input argument type
- * @tparam AccT accumulation type
- * @tparam OutT output type
- * @tparam IdxT Index type
- * @param x first set of points
- * @param y second set of points
- * @param m number of points in x
- * @param n number of points in y
- * @param k dimensionality
- *
- * @note If the specified DistT doesn't need the workspace at all, it
- * returns 0.
- */
 template <raft::distance::DistanceType DistT,
           typename DataT,
           typename AccT,
@@ -137,20 +71,6 @@ template <raft::distance::DistanceType DistT,
           typename IdxT = int>
 size_t getWorkspaceSize(const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) RAFT_EXPLICIT;
 
-/**
- * @brief Return the exact workspace size to compute the distance
- * @tparam DistanceType which distance to evaluate
- * @tparam DataT input argument type
- * @tparam AccT accumulation type
- * @tparam OutT output type
- * @tparam IdxT Index type
- * @param x first set of points (size m*k)
- * @param y second set of points (size n*k)
- * @return number of bytes needed in workspace
- *
- * @note If the specified DistT doesn't need the workspace at all, it
- * returns 0.
- */
 template <raft::distance::DistanceType DistT,
           typename DataT,
           typename AccT,
@@ -160,23 +80,6 @@ template <raft::distance::DistanceType DistT,
 size_t getWorkspaceSize(raft::device_matrix_view<DataT, IdxT, layout> const& x,
                         raft::device_matrix_view<DataT, IdxT, layout> const& y) RAFT_EXPLICIT;
 
-/**
- * @brief Evaluate pairwise distances for the simple use case
- * @tparam DistanceType which distance to evaluate
- * @tparam DataT input argument type
- * @tparam AccT accumulation type
- * @tparam OutT output type
- * @tparam IdxT Index type
- * @param handle raft handle for managing expensive resources
- * @param x first set of points
- * @param y second set of points
- * @param dist output distance matrix
- * @param m number of points in x
- * @param n number of points in y
- * @param k dimensionality
- * @param isRowMajor whether the matrices are row-major or col-major
- * @param metric_arg metric argument (used for Minkowski distance)
- */
 template <raft::distance::DistanceType DistT,
           typename DataT,
           typename AccT,
@@ -192,24 +95,6 @@ void distance(raft::resources const& handle,
               bool isRowMajor  = true,
               DataT metric_arg = 2.0f) RAFT_EXPLICIT;
 
-/**
- * @brief Convenience wrapper around 'distance' prim to convert runtime metric
- * into compile time for the purpose of dispatch
- * @tparam Type input/accumulation/output data-type
- * @tparam IdxT indexing type
- * @param handle raft handle for managing expensive resources
- * @param x first set of points
- * @param y second set of points
- * @param dist output distance matrix
- * @param m number of points in x
- * @param n number of points in y
- * @param k dimensionality
- * @param workspace temporary workspace buffer which can get resized as per the
- * needed workspace size
- * @param metric distance metric
- * @param isRowMajor whether the matrices are row-major or col-major
- * @param metric_arg metric argument (used for Minkowski distance)
- */
 template <typename Type, typename IdxT = int>
 void pairwise_distance(raft::resources const& handle,
                        const Type* x,
@@ -223,22 +108,6 @@ void pairwise_distance(raft::resources const& handle,
                        bool isRowMajor = true,
                        Type metric_arg = 2.0f) RAFT_EXPLICIT;
 
-/**
- * @brief Convenience wrapper around 'distance' prim to convert runtime metric
- * into compile time for the purpose of dispatch
- * @tparam Type input/accumulation/output data-type
- * @tparam IdxT indexing type
- * @param handle raft handle for managing expensive resources
- * @param x first set of points
- * @param y second set of points
- * @param dist output distance matrix
- * @param m number of points in x
- * @param n number of points in y
- * @param k dimensionality
- * @param metric distance metric
- * @param isRowMajor whether the matrices are row-major or col-major
- * @param metric_arg metric argument (used for Minkowski distance)
- */
 template <typename Type, typename IdxT = int>
 void pairwise_distance(raft::resources const& handle,
                        const Type* x,
@@ -250,49 +119,7 @@ void pairwise_distance(raft::resources const& handle,
                        raft::distance::DistanceType metric,
                        bool isRowMajor = true,
                        Type metric_arg = 2.0f) RAFT_EXPLICIT;
-/** @} */
 
-/**
- * \defgroup distance_mdspan Pairwise distance functions
- * @{
- */
-
-/**
- * @brief Evaluate pairwise distances for the simple use case.
- *
- * Note: Only contiguous row- or column-major layouts supported currently.
- *
- * Usage example:
- * @code{.cpp}
- * #include <raft/core/device_resources.hpp>
- * #include <raft/core/device_mdarray.hpp>
- * #include <raft/random/make_blobs.cuh>
- * #include <raft/distance/distance.cuh>
- *
- * raft::raft::device_resources handle;
- * int n_samples = 5000;
- * int n_features = 50;
- *
- * auto input = raft::make_device_matrix<float>(handle, n_samples, n_features);
- * auto labels = raft::make_device_vector<int>(handle, n_samples);
- * auto output = raft::make_device_matrix<float>(handle, n_samples, n_samples);
- *
- * raft::random::make_blobs(handle, input.view(), labels.view());
- * auto metric = raft::distance::DistanceType::L2SqrtExpanded;
- * raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric);
- * @endcode
- *
- * @tparam DistanceType which distance to evaluate
- * @tparam DataT input argument type
- * @tparam AccT accumulation type
- * @tparam OutT output type
- * @tparam IdxT Index type
- * @param handle raft handle for managing expensive resources
- * @param x first set of points (size n*k)
- * @param y second set of points (size m*k)
- * @param dist output distance matrix (size n*m)
- * @param metric_arg metric argument (used for Minkowski distance)
- */
 template <raft::distance::DistanceType DistT,
           typename DataT,
           typename AccT,
@@ -305,18 +132,6 @@ void distance(raft::resources const& handle,
               raft::device_matrix_view<OutT, IdxT, layout> dist,
               DataT metric_arg = 2.0f) RAFT_EXPLICIT;
 
-/**
- * @brief Convenience wrapper around 'distance' prim to convert runtime metric
- * into compile time for the purpose of dispatch
- * @tparam Type input/accumulation/output data-type
- * @tparam IdxT indexing type
- * @param handle raft handle for managing expensive resources
- * @param x first matrix of points (size mxk)
- * @param y second matrix of points (size nxk)
- * @param dist output distance matrix (size mxn)
- * @param metric distance metric
- * @param metric_arg metric argument (used for Minkowski distance)
- */
 template <typename Type, typename layout = layout_c_contiguous, typename IdxT = int>
 void pairwise_distance(raft::resources const& handle,
                        device_matrix_view<Type, IdxT, layout> const x,
@@ -325,8 +140,6 @@ void pairwise_distance(raft::resources const& handle,
                        raft::distance::DistanceType metric,
                        Type metric_arg = 2.0f) RAFT_EXPLICIT;
 
-/** @} */
-
 };  // namespace distance
 };  // namespace raft
 
diff --git a/cpp/include/raft/distance/fused_l2_nn-ext.cuh b/cpp/include/raft/distance/fused_l2_nn-ext.cuh
index 927ba62252..5dfbd48e8f 100644
--- a/cpp/include/raft/distance/fused_l2_nn-ext.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn-ext.cuh
@@ -16,23 +16,15 @@
 
 #pragma once
 
-#include <cstdint>  // int64_t
-#include <raft/core/kvp.hpp>
+#include <cstdint>                      // int64_t
+#include <raft/core/kvp.hpp>            // raft::KeyValuePair
 #include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft {
 namespace distance {
-/**
- * \defgroup fused_l2_nn Fused 1-nearest neighbors
- * @{
- * @}
- */
 
-/**
- * Initialize array using init value from reduction op
- */
 template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
 void initialize(raft::device_resources const& handle,
                 OutT* min,
@@ -40,41 +32,6 @@ void initialize(raft::device_resources const& handle,
                 DataT maxVal,
                 ReduceOpT redOp) RAFT_EXPLICIT;
 
-/**
- * \ingroup fused_l2_nn
- * @{
- */
-
-/**
- * @brief Wrapper around fusedL2NN with minimum reduction operators.
- *
- * fusedL2NN cannot be compiled in the distance library due to the lambda
- * operators, so this wrapper covers the most common case (minimum).
- * This should be preferred to the more generic API when possible, in order to
- * reduce compilation times for users of the shared library.
- *
- * @tparam DataT     data type
- * @tparam OutT      output type to either store 1-NN indices and their minimum
- *                   distances (e.g. raft::KeyValuePair<int, float>) or store only the min
- * distances.
- * @tparam IdxT      indexing arithmetic type
- * @param[out] min           will contain the reduced output (Length = `m`)
- *                           (on device)
- * @param[in]  x             first matrix. Row major. Dim = `m x k`.
- *                           (on device).
- * @param[in]  y             second matrix. Row major. Dim = `n x k`.
- *                           (on device).
- * @param[in]  xn            L2 squared norm of `x`. Length = `m`. (on device).
- * @param[in]  yn            L2 squared norm of `y`. Length = `n`. (on device)
- * @param[in]  m             gemm m
- * @param[in]  n             gemm n
- * @param[in]  k             gemm k
- * @param[in]  workspace     temp workspace. Size = sizeof(int)*m. (on device)
- * @param[in]  sqrt          Whether the output `minDist` should contain L2-sqrt
- * @param[in]  initOutBuffer whether to initialize the output buffer before the
- *                           main kernel launch
- * @param[in]  stream        cuda stream
- */
 template <typename DataT, typename OutT, typename IdxT>
 void fusedL2NNMinReduce(OutT* min,
                         const DataT* x,
@@ -89,8 +46,6 @@ void fusedL2NNMinReduce(OutT* min,
                         bool initOutBuffer,
                         cudaStream_t stream) RAFT_EXPLICIT;
 
-/** @} */
-
 }  // namespace distance
 }  // namespace raft
 
diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh
index a09c84c18a..7691b03250 100644
--- a/cpp/include/raft/matrix/detail/select_k-ext.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh
@@ -26,43 +26,6 @@
 
 namespace raft::matrix::detail {
 
-/**
- * Select k smallest or largest key/values from each row in the input data.
- *
- * If you think of the input data `in_val` as a row-major matrix with `len` columns and
- * `batch_size` rows, then this function selects `k` smallest/largest values in each row and fills
- * in the row-major matrix `out_val` of size (batch_size, k).
- *
- * @tparam T
- *   the type of the keys (what is being compared).
- * @tparam IdxT
- *   the index type (what is being selected together with the keys).
- *
- * @param[in] in_val
- *   contiguous device array of inputs of size (len * batch_size);
- *   these are compared and selected.
- * @param[in] in_idx
- *   contiguous device array of inputs of size (len * batch_size);
- *   typically, these are indices of the corresponding in_val.
- * @param batch_size
- *   number of input rows, i.e. the batch size.
- * @param len
- *   length of a single input array (row); also sometimes referred as n_cols.
- *   Invariant: len >= k.
- * @param k
- *   the number of outputs to select in each input row.
- * @param[out] out_val
- *   contiguous device array of outputs of size (k * batch_size);
- *   the k smallest/largest values from each row of the `in_val`.
- * @param[out] out_idx
- *   contiguous device array of outputs of size (k * batch_size);
- *   the payload selected together with `out_val`.
- * @param select_min
- *   whether to select k smallest (true) or largest (false) keys.
- * @param stream
- * @param mr an optional memory resource to use across the calls (you can provide a large enough
- *           memory pool here to avoid memory allocations within the call).
- */
 template <typename T, typename IdxT>
 void select_k(const T* in_val,
               const IdxT* in_idx,
diff --git a/cpp/include/raft/neighbors/ball_cover-ext.cuh b/cpp/include/raft/neighbors/ball_cover-ext.cuh
index add2d024d4..dd28cdf92a 100644
--- a/cpp/include/raft/neighbors/ball_cover-ext.cuh
+++ b/cpp/include/raft/neighbors/ball_cover-ext.cuh
@@ -29,71 +29,10 @@
 
 namespace raft::neighbors::ball_cover {
 
-/**
- * @defgroup random_ball_cover Random Ball Cover algorithm
- * @{
- */
-
-/**
- * Builds and populates a previously unbuilt BallCoverIndex
- *
- * Usage example:
- * @code{.cpp}
- *
- *  #include <raft/core/device_resources.hpp>
- *  #include <raft/neighbors/ball_cover.cuh>
- *  #include <raft/distance/distance_types.hpp>
- *  using namespace raft::neighbors;
- *
- *  raft::raft::device_resources handle;
- *  ...
- *  auto metric = raft::distance::DistanceType::L2Expanded;
- *  BallCoverIndex index(handle, X, metric);
- *
- *  ball_cover::build_index(handle, index);
- * @endcode
- *
- * @tparam idx_t knn index type
- * @tparam value_t knn value type
- * @tparam int_t integral type for knn params
- * @tparam matrix_idx_t matrix indexing type
- * @param[in] handle library resource management handle
- * @param[inout] index an empty (and not previous built) instance of BallCoverIndex
- */
 template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
 void build_index(raft::device_resources const& handle,
                  BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index) RAFT_EXPLICIT;
 
-/** @} */  // end group random_ball_cover
-
-/**
- * Performs a faster exact knn in metric spaces using the triangle
- * inequality with a number of landmark points to reduce the
- * number of distance computations from O(n^2) to O(sqrt(n)). This
- * performs an all neighbors knn, which can reuse memory when
- * the index and query are the same array. This function will
- * build the index and assumes rbc_build_index() has not already
- * been called.
- * @tparam idx_t knn index type
- * @tparam value_t knn distance type
- * @tparam int_t type for integers, such as number of rows/cols
- * @param[in] handle raft handle for resource management
- * @param[inout] index ball cover index which has not yet been built
- * @param[in] k number of nearest neighbors to find
- * @param[in] perform_post_filtering if this is false, only the closest k landmarks
- *                               are considered (which will return approximate
- *                               results).
- * @param[out] inds output knn indices
- * @param[out] dists output knn distances
- * @param[in] weight a weight for overlap between the closest landmark and
- *               the radius of other landmarks when pruning distances.
- *               Setting this value below 1 can effectively turn off
- *               computing distances against many other balls, enabling
- *               approximate nearest neighbors. Recall can be adjusted
- *               based on how many relevant balls are ignored. Note that
- *               many datasets can still have great recall even by only
- *               looking in the closest landmark.
- */
 template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
 void all_knn_query(raft::device_resources const& handle,
                    BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
@@ -103,61 +42,6 @@ void all_knn_query(raft::device_resources const& handle,
                    bool perform_post_filtering = true,
                    float weight                = 1.0) RAFT_EXPLICIT;
 
-/**
- * @ingroup random_ball_cover
- * @{
- */
-
-/**
- * Performs a faster exact knn in metric spaces using the triangle
- * inequality with a number of landmark points to reduce the
- * number of distance computations from O(n^2) to O(sqrt(n)). This
- * performs an all neighbors knn, which can reuse memory when
- * the index and query are the same array. This function will
- * build the index and assumes rbc_build_index() has not already
- * been called.
- *
- * Usage example:
- * @code{.cpp}
- *
- *  #include <raft/core/device_resources.hpp>
- *  #include <raft/neighbors/ball_cover.cuh>
- *  #include <raft/distance/distance_types.hpp>
- *  using namespace raft::neighbors;
- *
- *  raft::raft::device_resources handle;
- *  ...
- *  auto metric = raft::distance::DistanceType::L2Expanded;
- *
- *  // Construct a ball cover index
- *  BallCoverIndex index(handle, X, metric);
- *
- *  // Perform all neighbors knn query
- *  ball_cover::all_knn_query(handle, index, inds, dists, k);
- * @endcode
- *
- * @tparam idx_t knn index type
- * @tparam value_t knn distance type
- * @tparam int_t type for integers, such as number of rows/cols
- * @tparam matrix_idx_t matrix indexing type
- *
- * @param[in] handle raft handle for resource management
- * @param[in] index ball cover index which has not yet been built
- * @param[out] inds output knn indices
- * @param[out] dists output knn distances
- * @param[in] k number of nearest neighbors to find
- * @param[in] perform_post_filtering if this is false, only the closest k landmarks
- *                               are considered (which will return approximate
- *                               results).
- * @param[in] weight a weight for overlap between the closest landmark and
- *               the radius of other landmarks when pruning distances.
- *               Setting this value below 1 can effectively turn off
- *               computing distances against many other balls, enabling
- *               approximate nearest neighbors. Recall can be adjusted
- *               based on how many relevant balls are ignored. Note that
- *               many datasets can still have great recall even by only
- *               looking in the closest landmark.
- */
 template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
 void all_knn_query(raft::device_resources const& handle,
                    BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
@@ -167,37 +51,6 @@ void all_knn_query(raft::device_resources const& handle,
                    bool perform_post_filtering = true,
                    float weight                = 1.0) RAFT_EXPLICIT;
 
-/** @} */
-
-/**
- * Performs a faster exact knn in metric spaces using the triangle
- * inequality with a number of landmark points to reduce the
- * number of distance computations from O(n^2) to O(sqrt(n)). This
- * function does not build the index and assumes rbc_build_index() has
- * already been called. Use this function when the index and
- * query arrays are different, otherwise use rbc_all_knn_query().
- * @tparam idx_t index type
- * @tparam value_t distances type
- * @tparam int_t integer type for size info
- * @param[in] handle raft handle for resource management
- * @param[inout] index ball cover index which has not yet been built
- * @param[in] k number of nearest neighbors to find
- * @param[in] query the
- * @param[in] perform_post_filtering if this is false, only the closest k landmarks
- *                               are considered (which will return approximate
- *                               results).
- * @param[out] inds output knn indices
- * @param[out] dists output knn distances
- * @param[in] weight a weight for overlap between the closest landmark and
- *               the radius of other landmarks when pruning distances.
- *               Setting this value below 1 can effectively turn off
- *               computing distances against many other balls, enabling
- *               approximate nearest neighbors. Recall can be adjusted
- *               based on how many relevant balls are ignored. Note that
- *               many datasets can still have great recall even by only
- *               looking in the closest landmark.
- * @param[in] n_query_pts number of query points
- */
 template <typename idx_t, typename value_t, typename int_t>
 void knn_query(raft::device_resources const& handle,
                const BallCoverIndex<idx_t, value_t, int_t>& index,
@@ -208,62 +61,7 @@ void knn_query(raft::device_resources const& handle,
                value_t* dists,
                bool perform_post_filtering = true,
                float weight                = 1.0) RAFT_EXPLICIT;
-/**
- * @ingroup random_ball_cover
- * @{
- */
-
-/**
- * Performs a faster exact knn in metric spaces using the triangle
- * inequality with a number of landmark points to reduce the
- * number of distance computations from O(n^2) to O(sqrt(n)). This
- * function does not build the index and assumes rbc_build_index() has
- * already been called. Use this function when the index and
- * query arrays are different, otherwise use rbc_all_knn_query().
- *
- * Usage example:
- * @code{.cpp}
- *
- *  #include <raft/core/device_resources.hpp>
- *  #include <raft/neighbors/ball_cover.cuh>
- *  #include <raft/distance/distance_types.hpp>
- *  using namespace raft::neighbors;
- *
- *  raft::raft::device_resources handle;
- *  ...
- *  auto metric = raft::distance::DistanceType::L2Expanded;
- *
- *  // Build a ball cover index
- *  BallCoverIndex index(handle, X, metric);
- *  ball_cover::build_index(handle, index);
- *
- *  // Perform all neighbors knn query
- *  ball_cover::knn_query(handle, index, inds, dists, k);
- * @endcode
 
- *
- * @tparam idx_t index type
- * @tparam value_t distances type
- * @tparam int_t integer type for size info
- * @tparam matrix_idx_t
- * @param[in] handle raft handle for resource management
- * @param[in] index ball cover index which has not yet been built
- * @param[in] query device matrix containing query data points
- * @param[out] inds output knn indices
- * @param[out] dists output knn distances
- * @param[in] k number of nearest neighbors to find
- * @param[in] perform_post_filtering if this is false, only the closest k landmarks
- *                               are considered (which will return approximate
- *                               results).
- * @param[in] weight a weight for overlap between the closest landmark and
- *               the radius of other landmarks when pruning distances.
- *               Setting this value below 1 can effectively turn off
- *               computing distances against many other balls, enabling
- *               approximate nearest neighbors. Recall can be adjusted
- *               based on how many relevant balls are ignored. Note that
- *               many datasets can still have great recall even by only
- *               looking in the closest landmark.
- */
 template <typename idx_t, typename value_t, typename int_t, typename matrix_idx_t>
 void knn_query(raft::device_resources const& handle,
                const BallCoverIndex<idx_t, value_t, int_t, matrix_idx_t>& index,
@@ -274,12 +72,6 @@ void knn_query(raft::device_resources const& handle,
                bool perform_post_filtering = true,
                float weight                = 1.0) RAFT_EXPLICIT;
 
-/** @} */
-
-// TODO: implement functions for:
-//  4. rbc_eps_neigh() - given a populated index, perform query against different query array
-//  5. rbc_all_eps_neigh() - populate a BallCoverIndex and query against training data
-
 }  // namespace raft::neighbors::ball_cover
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
diff --git a/cpp/include/raft/neighbors/brute_force-ext.cuh b/cpp/include/raft/neighbors/brute_force-ext.cuh
index 14ed70bcb3..fd343f1654 100644
--- a/cpp/include/raft/neighbors/brute_force-ext.cuh
+++ b/cpp/include/raft/neighbors/brute_force-ext.cuh
@@ -27,59 +27,6 @@
 
 namespace raft::neighbors::brute_force {
 
-/**
- * @defgroup brute_force_knn Brute-force K-Nearest Neighbors
- * @{
- */
-
-/**
- * @brief Performs a k-select across several (contiguous) row-partitioned index/distance
- * matrices formatted like the following:
- *
- * part1row1: k0, k1, k2, k3
- * part1row2: k0, k1, k2, k3
- * part1row3: k0, k1, k2, k3
- * part2row1: k0, k1, k2, k3
- * part2row2: k0, k1, k2, k3
- * part2row3: k0, k1, k2, k3
- * etc...
- *
- * The example above shows what an aggregated index/distance matrix
- * would look like with two partitions when n_samples=3 and k=4.
- *
- * When working with extremely large data sets that have been broken
- * over multiple indexes, such as when computing over multiple GPUs,
- * the ids will often start at 0 for each local knn index but the
- * global ids need to be used when merging them together. An optional
- * translations vector can be supplied to map the starting id of
- * each partition to its global id so that the final merged knn
- * is based on the global ids.
- *
- * Usage example:
- * @code{.cpp}
- *  #include <raft/core/device_resources.hpp>
- *  #include <raft/neighbors/brute_force.cuh>
- *  using namespace raft::neighbors;
- *
- *  raft::raft::device_resources handle;
- *  ...
- *  compute multiple knn graphs and aggregate row-wise
- *  (see detailed description above)
- *  ...
- *  brute_force::knn_merge_parts(handle, in_keys, in_values, out_keys, out_values, n_samples);
- * @endcode
- *
- * @tparam idx_t
- * @tparam value_t
- *
- * @param[in] handle
- * @param[in] in_keys matrix of input keys (size n_samples * n_parts * k)
- * @param[in] in_values matrix of input values (size n_samples * n_parts * k)
- * @param[out] out_keys matrix of output keys (size n_samples * k)
- * @param[out] out_values matrix of output values (size n_samples * k)
- * @param[in] n_samples number of rows in each partition
- * @param[in] translations optional vector of starting global id mappings for each local partition
- */
 template <typename value_t, typename idx_t>
 inline void knn_merge_parts(
   raft::device_resources const& handle,
@@ -90,41 +37,6 @@ inline void knn_merge_parts(
   size_t n_samples,
   std::optional<raft::device_vector_view<idx_t, idx_t>> translations = std::nullopt) RAFT_EXPLICIT;
 
-/**
- * @brief Flat C++ API function to perform a brute force knn on
- * a series of input arrays and combine the results into a single
- * output array for indexes and distances. Inputs can be either
- * row- or column-major but the output matrices will always be in
- * row-major format.
- *
- * Usage example:
- * @code{.cpp}
- *  #include <raft/core/device_resources.hpp>
- *  #include <raft/neighbors/brute_force.cuh>
- *  #include <raft/distance/distance_types.hpp>
- *  using namespace raft::neighbors;
- *
- *  raft::raft::device_resources handle;
- *  ...
- *  auto metric = raft::distance::DistanceType::L2SqrtExpanded;
- *  brute_force::knn(handle, index, search, indices, distances, metric);
- * @endcode
- *
- * @param[in] handle: the cuml handle to use
- * @param[in] index: vector of device matrices (each size m_i*d) to be used as the knn index
- * @param[in] search: matrix (size n*d) to be used for searching the index
- * @param[out] indices: matrix (size n*k) to store output knn indices
- * @param[out] distances: matrix (size n*k) to store the output knn distance
- * @param[in] metric: distance metric to use. Euclidean (L2) is used by default
- * @param[in] metric_arg: the value of `p` for Minkowski (l-p) distances. This
- * 					 is ignored if the metric_type is not Minkowski.
- * @param[in] global_id_offset: optional starting global id mapping for the local partition
- *                              (assumes the index contains contiguous ids in the global id space)
- * @param[in] distance_epilogue: optional epilogue function to run after computing distances. This
-                                 function takes a triple of the (value, rowid, colid) for each
-                                 element in the pairwise distances and returns a transformed value
-                                 back.
- */
 template <typename idx_t,
           typename value_t,
           typename matrix_idx,
@@ -141,37 +53,6 @@ void knn(raft::device_resources const& handle,
          std::optional<idx_t> global_id_offset = std::nullopt,
          epilogue_op distance_epilogue         = raft::identity_op()) RAFT_EXPLICIT;
 
-/**
- * @brief Compute the k-nearest neighbors using L2 expanded/unexpanded distance.
- *
- * This is a specialized function for fusing the k-selection with the distance
- * computation when k < 64. The value of k will be inferred from the number
- * of columns in the output matrices.
- *
- * Usage example:
- * @code{.cpp}
- *  #include <raft/core/device_resources.hpp>
- *  #include <raft/neighbors/brute_force.cuh>
- *  #include <raft/distance/distance_types.hpp>
- *  using namespace raft::neighbors;
- *
- *  raft::raft::device_resources handle;
- *  ...
- *  auto metric = raft::distance::DistanceType::L2SqrtExpanded;
- *  brute_force::fused_l2_knn(handle, index, search, indices, distances, metric);
- * @endcode
-
- * @tparam value_t type of values
- * @tparam idx_t type of indices
- * @tparam idx_layout layout type of index matrix
- * @tparam query_layout layout type of query matrix
- * @param[in] handle raft handle for sharing expensive resources
- * @param[in] index input index array on device (size m * d)
- * @param[in] query input query array on device (size n * d)
- * @param[out] out_inds output indices array on device (size n * k)
- * @param[out] out_dists output dists array on device (size n * k)
- * @param[in] metric type of distance computation to perform (must be a variant of L2)
- */
 template <typename value_t, typename idx_t, typename idx_layout, typename query_layout>
 void fused_l2_knn(raft::device_resources const& handle,
                   raft::device_matrix_view<const value_t, idx_t, idx_layout> index,
@@ -180,8 +61,6 @@ void fused_l2_knn(raft::device_resources const& handle,
                   raft::device_matrix_view<value_t, idx_t, row_major> out_dists,
                   raft::distance::DistanceType metric) RAFT_EXPLICIT;
 
-/** @} */  // end group brute_force_knn
-
 }  // namespace raft::neighbors::brute_force
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh
index a7cf4ebd79..f3aa4f299f 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh
@@ -28,31 +28,6 @@ namespace raft::neighbors::ivf_flat::detail {
 
 using namespace raft::spatial::knn::detail;  // NOLINT
 
-/**
- * @brief Configure and launch an appropriate template instance of the interleaved scan kernel.
- *
- * @tparam T value type
- * @tparam AccT accumulated type
- * @tparam IdxT type of the indices
- *
- * @param index previously built ivf-flat index
- * @param[in] queries device pointer to the query vectors [batch_size, dim]
- * @param[in] coarse_query_results device pointer to the cluster (list) ids [batch_size, n_probes]
- * @param n_queries batch size
- * @param metric type of the measured distance
- * @param n_probes number of nearest clusters to query
- * @param k number of nearest neighbors.
- *            NB: the maximum value of `k` is limited statically by `kMaxCapacity`.
- * @param select_min whether to select nearest (true) or furthest (false) points w.r.t. the given
- * metric.
- * @param[out] neighbors device pointer to the result indices for each query and cluster
- * [batch_size, grid_dim_x, k]
- * @param[out] distances device pointer to the result distances for each query and cluster
- * [batch_size, grid_dim_x, k]
- * @param[inout] grid_dim_x number of blocks launched across all n_probes clusters;
- *               (one block processes one or more probes, hence: 1 <= grid_dim_x <= n_probes)
- * @param stream
- */
 template <typename T, typename AccT, typename IdxT>
 void ivfflat_interleaved_scan(const raft::neighbors::ivf_flat::index<T, IdxT>& index,
                               const T* queries,
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
index 7e27c50bf7..95f179e59f 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
@@ -26,7 +26,6 @@
 
 namespace raft::neighbors::ivf_flat::detail {
 
-/** See raft::neighbors::ivf_flat::search docs */
 template <typename T, typename IdxT>
 void search(raft::device_resources const& handle,
             const search_params& params,
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
index 14f5e18013..58c8d2a0dd 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
@@ -64,6 +64,7 @@ template <typename OutT, typename LutT>
 using compute_similarity_kernel_t =
   decltype(&compute_similarity_kernel<OutT, LutT, 8, 0, true, true>);
 
+// TODO: consider remove
 template <typename OutT, typename LutT>
 struct occupancy_t {
   using shmem_unit = Pow2<128>;
diff --git a/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
index b080294065..62b2b25261 100644
--- a/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh
@@ -23,20 +23,7 @@
 #if defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 
 namespace raft::neighbors::detail {
-/**
- * @brief Select the k-nearest neighbors from dense
- * distance and index matrices.
- *
- * @param[in] inK partitioned knn distance matrix
- * @param[in] inV partitioned knn index matrix
- * @param[in] n_rows number of rows in distance and index matrices
- * @param[in] n_cols number of columns in distance and index matrices
- * @param[out] outK merged knn distance matrix
- * @param[out] outV merged knn index matrix
- * @param[in] select_min whether to select the min or the max distances
- * @param[in] k number of neighbors per partition (also number of merged neighbors)
- * @param[in] stream CUDA stream to use
- */
+
 template <typename payload_t = int, typename key_t = float>
 void select_k(const key_t* inK,
               const payload_t* inV,
diff --git a/cpp/include/raft/neighbors/ivf_flat-ext.cuh b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
index fe4e411aa4..4b261eebdc 100644
--- a/cpp/include/raft/neighbors/ivf_flat-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
@@ -31,38 +31,6 @@
 
 namespace raft::neighbors::ivf_flat {
 
-/**
- * @brief Build the index from the dataset for efficient search.
- *
- * NB: Currently, the following distance metrics are supported:
- * - L2Expanded
- * - L2Unexpanded
- * - InnerProduct
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::neighbors;
- *   // use default index parameters
- *   ivf_flat::index_params index_params;
- *   // create and fill the index from a [N, D] dataset
- *   auto index = ivf_flat::build(handle, index_params, dataset, N, D);
- *   // use default search parameters
- *   ivf_flat::search_params search_params;
- *   // search K nearest neighbours for each of the N queries
- *   ivf_flat::search(handle, search_params, index, queries, N, K, out_inds, out_dists);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] handle
- * @param[in] params configure the index building
- * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
- * @param[in] n_rows the number of samples
- * @param[in] dim the dimensionality of the data
- *
- * @return the constructed ivf-flat index
- */
 template <typename T, typename IdxT>
 auto build(raft::device_resources const& handle,
            const index_params& params,
@@ -70,121 +38,18 @@ auto build(raft::device_resources const& handle,
            IdxT n_rows,
            uint32_t dim) -> index<T, IdxT> RAFT_EXPLICIT;
 
-/**
- * @defgroup ivf_flat IVF Flat Algorithm
- * @{
- */
-
-/**
- * @brief Build the index from the dataset for efficient search.
- *
- * NB: Currently, the following distance metrics are supported:
- * - L2Expanded
- * - L2Unexpanded
- * - InnerProduct
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::neighbors;
- *   // use default index parameters
- *   ivf_flat::index_params index_params;
- *   // create and fill the index from a [N, D] dataset
- *   auto index = ivf_flat::build(handle, dataset, index_params);
- *   // use default search parameters
- *   ivf_flat::search_params search_params;
- *   // search K nearest neighbours for each of the N queries
- *   ivf_flat::search(handle, index, queries, out_inds, out_dists, search_params, k);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- * @tparam int_t precision / type of integral arguments
- *
- * @param[in] handle
- * @param[in] params configure the index building
- * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
- *
- * @return the constructed ivf-flat index
- */
 template <typename T, typename IdxT>
 auto build(raft::device_resources const& handle,
            const index_params& params,
            raft::device_matrix_view<const T, IdxT, row_major> dataset)
   -> index<T, IdxT> RAFT_EXPLICIT;
 
-/**
- * @brief Build the index from the dataset for efficient search.
- *
- * NB: Currently, the following distance metrics are supported:
- * - L2Expanded
- * - L2Unexpanded
- * - InnerProduct
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::neighbors;
- *   // use default index parameters
- *   ivf_flat::index_params index_params;
- *   // create and fill the index from a [N, D] dataset
- *   ivf_flat::index<decltype(dataset::Type), decltype(dataset::index_type)> index;
- *   ivf_flat::build(handle, dataset, index_params, index);
- *   // use default search parameters
- *   ivf_flat::search_params search_params;
- *   // search K nearest neighbours for each of the N queries
- *   ivf_flat::search(handle, index, queries, out_inds, out_dists, search_params, k);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- * @tparam int_t precision / type of integral arguments
- * @tparam matrix_IdxT matrix indexing type
- *
- * @param[in] handle
- * @param[in] params configure the index building
- * @param[in] dataset raft::device_matrix_view to a row-major matrix [n_rows, dim]
- * @param[out] idx reference to ivf_flat::index
- *
- */
 template <typename T, typename IdxT>
 void build(raft::device_resources const& handle,
            const index_params& params,
            raft::device_matrix_view<const T, IdxT, row_major> dataset,
            raft::neighbors::ivf_flat::index<T, IdxT>& idx) RAFT_EXPLICIT;
 
-/** @} */
-
-/**
- * @brief Build a new index containing the data of the original plus new extra vectors.
- *
- * Implementation note:
- *    The new data is clustered according to existing kmeans clusters, then the cluster
- *    centers are adjusted to match the newly labeled data.
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::neighbors;
- *   ivf_flat::index_params index_params;
- *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
- *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_flat::build(handle, index_params, dataset, N, D);
- *   // fill the index with the data
- *   auto index = ivf_flat::extend(handle, index_empty, dataset, nullptr, N);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] handle
- * @param[in] orig_index original index
- * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices a device pointer to a vector of indices [n_rows].
- *    If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[in] n_rows number of rows in `new_vectors`
- *
- * @return the constructed extended ivf-flat index
- */
 template <typename T, typename IdxT>
 auto extend(raft::device_resources const& handle,
             const index<T, IdxT>& orig_index,
@@ -192,76 +57,12 @@ auto extend(raft::device_resources const& handle,
             const IdxT* new_indices,
             IdxT n_rows) -> index<T, IdxT> RAFT_EXPLICIT;
 
-/**
- * @ingroup ivf_flat
- * @{
- */
-
-/**
- * @brief Build a new index containing the data of the original plus new extra vectors.
- *
- * Implementation note:
- *    The new data is clustered according to existing kmeans clusters, then the cluster
- *    centers are adjusted to match the newly labeled data.
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::neighbors;
- *   ivf_flat::index_params index_params;
- *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
- *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_flat::build(handle, dataset, index_params, dataset);
- *   // fill the index with the data
- *   auto index = ivf_flat::extend(handle, index_empty, dataset);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] handle
- * @param[in] new_vectors raft::device_matrix_view to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices optional raft::device_matrix_view to a vector of indices [n_rows].
- *    If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[in] orig_index original index
- *
- * @return the constructed extended ivf-flat index
- */
 template <typename T, typename IdxT>
 auto extend(raft::device_resources const& handle,
             raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
             std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices,
             const index<T, IdxT>& orig_index) -> index<T, IdxT> RAFT_EXPLICIT;
 
-/** @} */
-
-/**
- * @brief Extend the index in-place with the new data.
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::neighbors;
- *   ivf_flat::index_params index_params;
- *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
- *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_flat::build(handle, index_params, dataset, N, D);
- *   // fill the index with the data
- *   ivf_flat::extend(handle, index_empty, dataset, nullptr, N);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param handle
- * @param[inout] index
- * @param[in] new_vectors a device pointer to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices a device pointer to a vector of indices [n_rows].
- *    If the original index is empty (`orig_index.size() == 0`), you can pass `nullptr`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[in] n_rows the number of samples
- */
 template <typename T, typename IdxT>
 void extend(raft::device_resources const& handle,
             index<T, IdxT>* index,
@@ -269,87 +70,12 @@ void extend(raft::device_resources const& handle,
             const IdxT* new_indices,
             IdxT n_rows) RAFT_EXPLICIT;
 
-/**
- * @ingroup ivf_flat
- * @{
- */
-
-/**
- * @brief Extend the index in-place with the new data.
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::neighbors;
- *   ivf_flat::index_params index_params;
- *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
- *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_flat::build(handle, index_params, dataset);
- *   // fill the index with the data
- *   std::optional<raft::device_vector_view<const IdxT, IdxT>> no_op = std::nullopt;
- *   ivf_flat::extend(handle, dataset, no_opt, &index_empty);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] handle
- * @param[in] new_vectors raft::device_matrix_view to a row-major matrix [n_rows, index.dim()]
- * @param[in] new_indices optional raft::device_matrix_view to a vector of indices [n_rows].
- *    If the original index is empty (`orig_index.size() == 0`), you can pass `std::nullopt`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[inout] index pointer to index, to be overwritten in-place
- */
 template <typename T, typename IdxT>
 void extend(raft::device_resources const& handle,
             raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
             std::optional<raft::device_vector_view<const IdxT, IdxT>> new_indices,
             index<T, IdxT>* index) RAFT_EXPLICIT;
 
-/** @} */
-
-/**
- * @brief Search ANN using the constructed index.
- *
- * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example.
- *
- * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
- * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
- * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
- * eliminate entirely allocations happening within `search`:
- * @code{.cpp}
- *   ...
- *   // Create a pooling memory resource with a pre-defined initial size.
- *   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr(
- *     rmm::mr::get_current_device_resource(), 1024 * 1024);
- *   // use default search parameters
- *   ivf_flat::search_params search_params;
- *   // Use the same allocator across multiple searches to reduce the number of
- *   // cuda memory allocations
- *   ivf_flat::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr);
- *   ivf_flat::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr);
- *   ivf_flat::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr);
- *   ...
- * @endcode
- * The exact size of the temporary buffer depends on multiple factors and is an implementation
- * detail. However, you can safely specify a small initial size for the memory pool, so that only a
- * few allocations happen to grow it during the first invocations of the `search`.
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices
- *
- * @param[in] handle
- * @param[in] params configure the search
- * @param[in] index ivf-flat constructed index
- * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()]
- * @param[in] n_queries the batch size
- * @param[in] k the number of neighbors to find for each query.
- * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
- * [n_queries, k]
- * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- * @param[in] mr an optional memory resource to use across the searches (you can provide a large
- * enough memory pool here to avoid memory allocations within search).
- */
 template <typename T, typename IdxT>
 void search(raft::device_resources const& handle,
             const search_params& params,
@@ -361,44 +87,6 @@ void search(raft::device_resources const& handle,
             float* distances,
             rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT;
 
-/**
- * @ingroup ivf_flat
- * @{
- */
-
-/**
- * @brief Search ANN using the constructed index.
- *
- * See the [ivf_flat::build](#ivf_flat::build) documentation for a usage example.
- *
- * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
- * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
- * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
- * eliminate entirely allocations happening within `search`:
- * @code{.cpp}
- *   ...
- *   // use default search parameters
- *   ivf_flat::search_params search_params;
- *   // Use the same allocator across multiple searches to reduce the number of
- *   // cuda memory allocations
- *   ivf_flat::search(handle, index, queries1, out_inds1, out_dists1, search_params, K);
- *   ivf_flat::search(handle, index, queries2, out_inds2, out_dists2, search_params, K);
- *   ivf_flat::search(handle, index, queries3, out_inds3, out_dists3, search_params, K);
- *   ...
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices
- * @tparam int_t precision / type of integral arguments
- *
- * @param[in] handle
- * @param[in] params configure the search
- * @param[in] index ivf-flat constructed index
- * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()]
- * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
- * [n_queries, k]
- * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- */
 template <typename T, typename IdxT>
 void search(raft::device_resources const& handle,
             const search_params& params,
@@ -407,8 +95,6 @@ void search(raft::device_resources const& handle,
             raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,
             raft::device_matrix_view<float, IdxT, row_major> distances) RAFT_EXPLICIT;
 
-/** @} */
-
 }  // namespace raft::neighbors::ivf_flat
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
diff --git a/cpp/include/raft/neighbors/ivf_pq-ext.cuh b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
index 31cc299996..409da67e37 100644
--- a/cpp/include/raft/neighbors/ivf_pq-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
@@ -29,96 +29,23 @@
 
 namespace raft::neighbors::ivf_pq {
 
-/**
- * @defgroup ivf_pq IVF PQ Algorithm
- * @{
- */
-
-/**
- * @brief Build the index from the dataset for efficient search.
- *
- * NB: Currently, the following distance metrics are supported:
- * - L2Expanded
- * - L2Unexpanded
- * - InnerProduct
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] handle
- * @param[in] params configure the index building
- * @param[in] dataset a device matrix view to a row-major matrix [n_rows, dim]
- *
- * @return the constructed ivf-pq index
- */
 template <typename T, typename IdxT = uint32_t>
 index<IdxT> build(raft::device_resources const& handle,
                   const index_params& params,
                   raft::device_matrix_view<const T, IdxT, row_major> dataset) RAFT_EXPLICIT;
 
-/**
- * @brief Extend the index with the new data.
- * *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] handle
- * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()]
- * @param[in] new_indices a device vector view to a vector of indices [n_rows].
- *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[inout] idx
- */
 template <typename T, typename IdxT>
 index<IdxT> extend(raft::device_resources const& handle,
                    raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
                    std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,
                    const index<IdxT>& idx) RAFT_EXPLICIT;
 
-/**
- * @brief Extend the index with the new data.
- * *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] handle
- * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()]
- * @param[in] new_indices a device vector view to a vector of indices [n_rows].
- *    If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[inout] idx
- */
 template <typename T, typename IdxT>
 void extend(raft::device_resources const& handle,
             raft::device_matrix_view<const T, IdxT, row_major> new_vectors,
             std::optional<raft::device_vector_view<const IdxT, IdxT, row_major>> new_indices,
             index<IdxT>* idx) RAFT_EXPLICIT;
 
-/**
- * @brief Search ANN using the constructed index.
- *
- * See the [ivf_pq::build](#ivf_pq::build) documentation for a usage example.
- *
- * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
- * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
- * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
- * eliminate entirely allocations happening within `search`.
- * The exact size of the temporary buffer depends on multiple factors and is an implementation
- * detail. However, you can safely specify a small initial size for the memory pool, so that only a
- * few allocations happen to grow it during the first invocations of the `search`.
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices
- *
- * @param[in] handle
- * @param[in] params configure the search
- * @param[in] idx ivf-pq constructed index
- * @param[in] queries a device matrix view to a row-major matrix [n_queries, index->dim()]
- * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset
- * [n_queries, k]
- * @param[out] distances a device matrix view to the distances to the selected neighbors [n_queries,
- * k]
- */
 template <typename T, typename IdxT>
 void search(raft::device_resources const& handle,
             const search_params& params,
@@ -127,40 +54,6 @@ void search(raft::device_resources const& handle,
             raft::device_matrix_view<IdxT, IdxT, row_major> neighbors,
             raft::device_matrix_view<float, IdxT, row_major> distances) RAFT_EXPLICIT;
 
-/** @} */  // end group ivf_pq
-
-/**
- * @brief Build the index from the dataset for efficient search.
- *
- * NB: Currently, the following distance metrics are supported:
- * - L2Expanded
- * - L2Unexpanded
- * - InnerProduct
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::neighbors;
- *   // use default index parameters
- *   ivf_pq::index_params index_params;
- *   // create and fill the index from a [N, D] dataset
- *   auto index = ivf_pq::build(handle, index_params, dataset, N, D);
- *   // use default search parameters
- *   ivf_pq::search_params search_params;
- *   // search K nearest neighbours for each of the N queries
- *   ivf_pq::search(handle, search_params, index, queries, N, K, out_inds, out_dists);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] handle
- * @param[in] params configure the index building
- * @param[in] dataset a device/host pointer to a row-major matrix [n_rows, dim]
- * @param[in] n_rows the number of samples
- * @param[in] dim the dimensionality of the data
- *
- * @return the constructed ivf-pq index
- */
 template <typename T, typename IdxT = uint32_t>
 auto build(raft::device_resources const& handle,
            const index_params& params,
@@ -168,38 +61,6 @@ auto build(raft::device_resources const& handle,
            IdxT n_rows,
            uint32_t dim) -> index<IdxT> RAFT_EXPLICIT;
 
-/**
- * @brief Build a new index containing the data of the original plus new extra vectors.
- *
- * Implementation note:
- *    The new data is clustered according to existing kmeans clusters, then the cluster
- *    centers are unchanged.
- *
- * Usage example:
- * @code{.cpp}
- *   using namespace raft::neighbors;
- *   ivf_pq::index_params index_params;
- *   index_params.add_data_on_build = false;      // don't populate index on build
- *   index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training
- *   // train the index from a [N, D] dataset
- *   auto index_empty = ivf_pq::build(handle, index_params, dataset, N, D);
- *   // fill the index with the data
- *   auto index = ivf_pq::extend(handle, index_empty, dataset, nullptr, N);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] handle
- * @param[inout] idx original index
- * @param[in] new_vectors a device/host pointer to a row-major matrix [n_rows, idx.dim()]
- * @param[in] new_indices a device/host pointer to a vector of indices [n_rows].
- *    If the original index is empty (`idx.size() == 0`), you can pass `nullptr`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[in] n_rows the number of samples
- *
- * @return the constructed extended ivf-pq index
- */
 template <typename T, typename IdxT>
 auto extend(raft::device_resources const& handle,
             const index<IdxT>& idx,
@@ -207,20 +68,6 @@ auto extend(raft::device_resources const& handle,
             const IdxT* new_indices,
             IdxT n_rows) -> index<IdxT> RAFT_EXPLICIT;
 
-/**
- * @brief Extend the index with the new data.
- * *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] handle
- * @param[inout] idx
- * @param[in] new_vectors a device/host pointer to a row-major matrix [n_rows, idx.dim()]
- * @param[in] new_indices a device/host pointer to a vector of indices [n_rows].
- *    If the original index is empty (`idx.size() == 0`), you can pass `nullptr`
- *    here to imply a continuous range `[0...n_rows)`.
- * @param[in] n_rows the number of samples
- */
 template <typename T, typename IdxT>
 void extend(raft::device_resources const& handle,
             index<IdxT>* idx,
@@ -228,48 +75,6 @@ void extend(raft::device_resources const& handle,
             const IdxT* new_indices,
             IdxT n_rows) RAFT_EXPLICIT;
 
-/**
- * @brief Search ANN using the constructed index.
- *
- * See the [ivf_pq::build](#ivf_pq::build) documentation for a usage example.
- *
- * Note, this function requires a temporary buffer to store intermediate results between cuda kernel
- * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can
- * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or
- * eliminate entirely allocations happening within `search`:
- * @code{.cpp}
- *   ...
- *   // Create a pooling memory resource with a pre-defined initial size.
- *   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr(
- *     rmm::mr::get_current_device_resource(), 1024 * 1024);
- *   // use default search parameters
- *   ivf_pq::search_params search_params;
- *   // Use the same allocator across multiple searches to reduce the number of
- *   // cuda memory allocations
- *   ivf_pq::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr);
- *   ivf_pq::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr);
- *   ivf_pq::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr);
- *   ...
- * @endcode
- * The exact size of the temporary buffer depends on multiple factors and is an implementation
- * detail. However, you can safely specify a small initial size for the memory pool, so that only a
- * few allocations happen to grow it during the first invocations of the `search`.
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices
- *
- * @param[in] handle
- * @param[in] params configure the search
- * @param[in] idx ivf-pq constructed index
- * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()]
- * @param[in] n_queries the batch size
- * @param[in] k the number of neighbors to find for each query.
- * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
- * [n_queries, k]
- * @param[out] distances a device pointer to the distances to the selected neighbors [n_queries, k]
- * @param[in] mr an optional memory resource to use across the searches (you can provide a large
- * enough memory pool here to avoid memory allocations within search).
- */
 template <typename T, typename IdxT>
 void search(raft::device_resources const& handle,
             const raft::neighbors::ivf_pq::search_params& params,
diff --git a/cpp/include/raft/neighbors/refine-ext.cuh b/cpp/include/raft/neighbors/refine-ext.cuh
index 65e356eed4..79431086a7 100644
--- a/cpp/include/raft/neighbors/refine-ext.cuh
+++ b/cpp/include/raft/neighbors/refine-ext.cuh
@@ -28,48 +28,6 @@
 
 namespace raft::neighbors {
 
-/**
- * @defgroup ann_refine Approximate Nearest Neighbors Refinement
- * @{
- */
-
-/**
- * @brief Refine nearest neighbor search.
- *
- * Refinement is an operation that follows an approximate NN search. The approximate search has
- * already selected n_candidates neighbor candidates for each query. We narrow it down to k
- * neighbors. For each query, we calculate the exact distance between the query and its
- * n_candidates neighbor candidate, and select the k nearest ones.
- *
- * The k nearest neighbors and distances are returned.
- *
- * Example usage
- * @code{.cpp}
- *   using namespace raft::neighbors;
- *   // use default index parameters
- *   ivf_pq::index_params index_params;
- *   // create and fill the index from a [N, D] dataset
- *   auto index = ivf_pq::build(handle, index_params, dataset, N, D);
- *   // use default search parameters
- *   ivf_pq::search_params search_params;
- *   // search m = 4 * k nearest neighbours for each of the N queries
- *   ivf_pq::search(handle, search_params, index, queries, N, 4 * k, neighbor_candidates,
- *                  out_dists_tmp);
- *   // refine it to the k nearest one
- *   refine(handle, dataset, queries, neighbor_candidates, out_indices, out_dists,
- *           index.metric());
- * @endcode
- *
- *
- * @param[in] handle the raft handle
- * @param[in] dataset device matrix that stores the dataset [n_rows, dims]
- * @param[in] queries device matrix of the queries [n_queris, dims]
- * @param[in] neighbor_candidates indices of candidate vectors [n_queries, n_candidates], where
- *   n_candidates >= k
- * @param[out] indices device matrix that stores the refined indices [n_queries, k]
- * @param[out] distances device matrix that stores the refined distances [n_queries, k]
- * @param[in] metric distance metric to use. Euclidean (L2) is used by default
- */
 template <typename idx_t, typename data_t, typename distance_t, typename matrix_idx>
 void refine(raft::device_resources const& handle,
             raft::device_matrix_view<const data_t, matrix_idx, row_major> dataset,
@@ -79,16 +37,6 @@ void refine(raft::device_resources const& handle,
             raft::device_matrix_view<distance_t, matrix_idx, row_major> distances,
             distance::DistanceType metric = distance::DistanceType::L2Unexpanded) RAFT_EXPLICIT;
 
-/** Same as above, but all input and out data is in host memory.
- * @param[in] handle the raft handle
- * @param[in] dataset host matrix that stores the dataset [n_rows, dims]
- * @param[in] queries host matrix of the queries [n_queris, dims]
- * @param[in] neighbor_candidates host matrix with indices of candidate vectors [n_queries,
- *   n_candidates], where n_candidates >= k
- * @param[out] indices host matrix that stores the refined indices [n_queries, k]
- * @param[out] distances host matrix that stores the refined distances [n_queries, k]
- * @param[in] metric distance metric to use. Euclidean (L2) is used by default
- */
 template <typename idx_t, typename data_t, typename distance_t, typename matrix_idx>
 void refine(raft::device_resources const& handle,
             raft::host_matrix_view<const data_t, matrix_idx, row_major> dataset,
@@ -98,7 +46,6 @@ void refine(raft::device_resources const& handle,
             raft::host_matrix_view<distance_t, matrix_idx, row_major> distances,
             distance::DistanceType metric = distance::DistanceType::L2Unexpanded) RAFT_EXPLICIT;
 
-/** @} */  // end group ann_refine
 }  // namespace raft::neighbors
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
index b12482b19d..32abcf8b81 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
@@ -22,22 +22,7 @@
 #if defined(RAFT_EXPLICIT_INSTANTIATE_ONLY)
 
 namespace raft::spatial::knn::detail {
-/**
- * Compute the k-nearest neighbors using L2 expanded/unexpanded distance.
 
- * @tparam value_idx
- * @tparam value_t
- * @param[out] out_inds output indices array on device (size n_query_rows * k)
- * @param[out] out_dists output dists array on device (size n_query_rows * k)
- * @param[in] index input index array on device (size n_index_rows * D)
- * @param[in] query input query array on device (size n_query_rows * D)
- * @param[in] n_index_rows number of rows in index array
- * @param[in] n_query_rows number of rows in query array
- * @param[in] k number of closest neighbors to return
- * @param[in] rowMajorIndex are the index arrays in row-major layout?
- * @param[in] rowMajorQuery are the query array in row-major layout?
- * @param[in] stream stream to order kernel launch
- */
 template <typename value_idx, typename value_t, bool usePrevTopKs = false>
 void fusedL2Knn(size_t D,
                 value_idx* out_inds,
diff --git a/cpp/include/raft/util/memory_pool-ext.hpp b/cpp/include/raft/util/memory_pool-ext.hpp
index fb48bc70c4..e3bf11d425 100644
--- a/cpp/include/raft/util/memory_pool-ext.hpp
+++ b/cpp/include/raft/util/memory_pool-ext.hpp
@@ -21,38 +21,6 @@
 
 namespace raft {
 
-/**
- * @brief Get a pointer to a pooled memory resource within the scope of the lifetime of the returned
- * unique pointer.
- *
- * This function is useful in the code where multiple repeated allocations/deallocations are
- * expected.
- * Use case example:
- * @code{.cpp}
- *   void my_func(..., size_t n, rmm::mr::device_memory_resource* mr = nullptr) {
- *     auto pool_guard = raft::get_pool_memory_resource(mr, 2 * n * sizeof(float));
- *     if (pool_guard){
- *       RAFT_LOG_INFO("Created a pool %zu bytes", pool_guard->pool_size());
- *     } else {
- *       RAFT_LOG_INFO("Using the current default or explicitly passed device memory resource");
- *     }
- *     rmm::device_uvector<float> x(n, stream, mr);
- *     rmm::device_uvector<float> y(n, stream, mr);
- *     ...
- *   }
- * @endcode
- * Here, the new memory resource would be created within the function scope if the passed `mr` is
- * null and the default resource is not a pool. After the call, `mr` contains a valid memory
- * resource in any case.
- *
- * @param[inout] mr if not null do nothing; otherwise get the current device resource and wrap it
- * into a `pool_memory_resource` if necessary and return the pointer to the result.
- * @param initial_size if a new memory pool is created, this would be its initial size (rounded up
- * to 256 bytes).
- *
- * @return if a new memory pool is created, it returns a unique_ptr to it;
- *   this managed pointer controls the lifetime of the created memory resource.
- */
 std::unique_ptr<rmm::mr::device_memory_resource> get_pool_memory_resource(
   rmm::mr::device_memory_resource*& mr, size_t initial_size);
 
diff --git a/cpp/include/raft/util/memory_pool-inl.hpp b/cpp/include/raft/util/memory_pool-inl.hpp
index 6d457923b9..7968779e3d 100644
--- a/cpp/include/raft/util/memory_pool-inl.hpp
+++ b/cpp/include/raft/util/memory_pool-inl.hpp
@@ -25,6 +25,38 @@
 
 namespace raft {
 
+/**
+ * @brief Get a pointer to a pooled memory resource within the scope of the lifetime of the returned
+ * unique pointer.
+ *
+ * This function is useful in the code where multiple repeated allocations/deallocations are
+ * expected.
+ * Use case example:
+ * @code{.cpp}
+ *   void my_func(..., size_t n, rmm::mr::device_memory_resource* mr = nullptr) {
+ *     auto pool_guard = raft::get_pool_memory_resource(mr, 2 * n * sizeof(float));
+ *     if (pool_guard){
+ *       RAFT_LOG_INFO("Created a pool %zu bytes", pool_guard->pool_size());
+ *     } else {
+ *       RAFT_LOG_INFO("Using the current default or explicitly passed device memory resource");
+ *     }
+ *     rmm::device_uvector<float> x(n, stream, mr);
+ *     rmm::device_uvector<float> y(n, stream, mr);
+ *     ...
+ *   }
+ * @endcode
+ * Here, the new memory resource would be created within the function scope if the passed `mr` is
+ * null and the default resource is not a pool. After the call, `mr` contains a valid memory
+ * resource in any case.
+ *
+ * @param[inout] mr if not null do nothing; otherwise get the current device resource and wrap it
+ * into a `pool_memory_resource` if necessary and return the pointer to the result.
+ * @param initial_size if a new memory pool is created, this would be its initial size (rounded up
+ * to 256 bytes).
+ *
+ * @return if a new memory pool is created, it returns a unique_ptr to it;
+ *   this managed pointer controls the lifetime of the created memory resource.
+ */
 RAFT_INLINE_CONDITIONAL std::unique_ptr<rmm::mr::device_memory_resource> get_pool_memory_resource(
   rmm::mr::device_memory_resource*& mr, size_t initial_size)
 {
diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index 8b4470fc16..d26028e458 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -319,10 +319,14 @@ template void raft::expensive<float>(float);
 
 **Design considerations**: 
 
-1. In the `-ext.cuh` header, do not include implementation headers. Only include function parameter types and types that are used to instantiate the templates.
+1. In the `-ext.cuh` header, do not include implementation headers. Only include function parameter types and types that are used to instantiate the templates. If a primitive takes custom parameter types, define them in a separate header called `<primitive_name>_types.hpp`. 
 
 2. Keep docstrings in the `-inl.cuh` header, as it is closer to the code. Remove docstrings from template definitions in the `-ext.cuh` header.
 
+3. The order of inclusion in `expensive.cuh` is extremely important. If `RAFT_EXPLICIT_INSTANTIATE_ONLY` is not defined, but `RAFT_COMPILED` is defined, then we must include the template definitions before the `extern template` instantiations.
+
+4. If a header file defines multiple expensive templates, one of them may not have an explicit instantiation. In this case, **do define** the template with `RAFT_EXPLICIT` in the `-ext` header. This way, when the template is instantiated, the developer gets a helpful error message instead of a confusing "function not found" error.
+
 This header structure was proposed in [issue #1416](https://github.com/rapidsai/raft/issues/1416), which contains more background on the motivation of this structure and the mechanics of C++ template instantiation. 
 
 ## Testing

From e8fc2da1a498cd8941da34ebb65fbb69774911b0 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Wed, 19 Apr 2023 17:18:26 +0200
Subject: [PATCH 87/89] Fix extraneous and missing includes in headers

---
 cpp/include/raft/distance/fused_l2_nn-ext.cuh |  7 ++++---
 .../detail/coalesced_reduction_types.cuh      |  1 +
 .../raft/matrix/detail/select_k-ext.cuh       | 10 +++++-----
 cpp/include/raft/neighbors/ball_cover-ext.cuh | 13 ++++--------
 .../raft/neighbors/brute_force-ext.cuh        | 11 +++++-----
 .../detail/ivf_flat_interleaved_scan-ext.cuh  | 11 ++++------
 .../neighbors/detail/ivf_flat_search-ext.cuh  |  8 +++-----
 .../neighbors/detail/ivf_flat_search-inl.cuh  |  3 +++
 .../neighbors/detail/ivf_flat_serialize.cuh   |  1 +
 .../detail/ivf_pq_compute_similarity-ext.cuh  | 16 ---------------
 cpp/include/raft/neighbors/ivf_flat-ext.cuh   | 16 +++++++--------
 cpp/include/raft/neighbors/ivf_pq-ext.cuh     | 11 +++++-----
 cpp/include/raft/neighbors/refine-ext.cuh     | 20 ++++++++++---------
 .../knn/detail/ball_cover/registers-ext.cuh   |  2 +-
 .../spatial/knn/detail/fused_l2_knn-ext.cuh   |  1 +
 cpp/include/raft/util/memory_pool-ext.hpp     |  2 +-
 16 files changed, 56 insertions(+), 77 deletions(-)

diff --git a/cpp/include/raft/distance/fused_l2_nn-ext.cuh b/cpp/include/raft/distance/fused_l2_nn-ext.cuh
index 5dfbd48e8f..a0af04c4e8 100644
--- a/cpp/include/raft/distance/fused_l2_nn-ext.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn-ext.cuh
@@ -16,9 +16,10 @@
 
 #pragma once
 
-#include <cstdint>                      // int64_t
-#include <raft/core/kvp.hpp>            // raft::KeyValuePair
-#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+#include <cstdint>                         // int64_t
+#include <raft/core/device_resources.hpp>  // raft::device_resources
+#include <raft/core/kvp.hpp>               // raft::KeyValuePair
+#include <raft/util/raft_explicit.hpp>     // RAFT_EXPLICIT
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction_types.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction_types.cuh
index c31b4363dd..f8bcd03e89 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction_types.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction_types.cuh
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+// TODO: consider moving these types back into the coalesced reduction header
 namespace raft::linalg::detail {
 
 template <int warpSize, int rpb>
diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh
index 7691b03250..2b233c156d 100644
--- a/cpp/include/raft/matrix/detail/select_k-ext.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh
@@ -16,11 +16,11 @@
 
 #pragma once
 
-#include <cstdint>
-#include <cuda_fp16.h>
-#include <raft/util/raft_explicit.hpp>
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
+#include <cstdint>                                   // uint32_t
+#include <cuda_fp16.h>                               // __half
+#include <raft/util/raft_explicit.hpp>               // RAFT_EXPLICIT
+#include <rmm/cuda_stream_view.hpp>                  // rmm::cuda_stream_view
+#include <rmm/mr/device/device_memory_resource.hpp>  // rmm::mr::device_memory_resource
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
diff --git a/cpp/include/raft/neighbors/ball_cover-ext.cuh b/cpp/include/raft/neighbors/ball_cover-ext.cuh
index dd28cdf92a..b6ab12d8e1 100644
--- a/cpp/include/raft/neighbors/ball_cover-ext.cuh
+++ b/cpp/include/raft/neighbors/ball_cover-ext.cuh
@@ -15,15 +15,10 @@
  */
 #pragma once
 
-#include <cstdint>
-
-#include <raft/distance/distance_types.hpp>
-#include <raft/neighbors/ball_cover_types.hpp>
-#include <raft/spatial/knn/detail/ball_cover.cuh>
-#include <raft/spatial/knn/detail/ball_cover/common.cuh>
-#include <raft/util/raft_explicit.hpp>
-
-#include <thrust/transform.h>
+#include <cstdint>                              // uint32_t
+#include <raft/distance/distance_types.hpp>     // raft::distance::DistanceType
+#include <raft/neighbors/ball_cover_types.hpp>  // BallCoverIndex
+#include <raft/util/raft_explicit.hpp>          // RAFT_EXPLICIT
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
diff --git a/cpp/include/raft/neighbors/brute_force-ext.cuh b/cpp/include/raft/neighbors/brute_force-ext.cuh
index fd343f1654..98a186db86 100644
--- a/cpp/include/raft/neighbors/brute_force-ext.cuh
+++ b/cpp/include/raft/neighbors/brute_force-ext.cuh
@@ -16,12 +16,11 @@
 
 #pragma once
 
-#include <raft/core/device_mdspan.hpp>
-#include <raft/core/operators.hpp>  // raft::identity_op
-#include <raft/distance/distance_types.hpp>
-#include <raft/neighbors/detail/knn_brute_force.cuh>
-#include <raft/spatial/knn/detail/fused_l2_knn.cuh>
-#include <raft/util/raft_explicit.hpp>
+#include <raft/core/device_mdspan.hpp>       // raft::device_matrix_view
+#include <raft/core/device_resources.hpp>    // raft::device_resources
+#include <raft/core/operators.hpp>           // raft::identity_op
+#include <raft/distance/distance_types.hpp>  // raft::distance::DistanceType
+#include <raft/util/raft_explicit.hpp>       // RAFT_EXPLICIT
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh
index f3aa4f299f..46f72c4005 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh
@@ -16,18 +16,15 @@
 
 #pragma once
 
-#include <cstdint>                                // uintX_t
-#include <raft/neighbors/ivf_flat_types.hpp>      // index
-#include <raft/spatial/knn/detail/ann_utils.cuh>  // TODO: consider remove
-#include <raft/util/raft_explicit.hpp>            // RAFT_EXPLICIT
-#include <rmm/cuda_stream_view.hpp>               // rmm:cuda_stream_view
+#include <cstdint>                            // uintX_t
+#include <raft/neighbors/ivf_flat_types.hpp>  // raft::neighbors::ivf_flat::index
+#include <raft/util/raft_explicit.hpp>        // RAFT_EXPLICIT
+#include <rmm/cuda_stream_view.hpp>           // rmm::cuda_stream_view
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
 namespace raft::neighbors::ivf_flat::detail {
 
-using namespace raft::spatial::knn::detail;  // NOLINT
-
 template <typename T, typename AccT, typename IdxT>
 void ivfflat_interleaved_scan(const raft::neighbors::ivf_flat::index<T, IdxT>& index,
                               const T* queries,
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
index 95f179e59f..3bb3a4308d 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
@@ -16,11 +16,9 @@
 
 #pragma once
 
-#include <cstdint>                                // uintX_t
-#include <raft/neighbors/ivf_flat_types.hpp>      // index
-#include <raft/spatial/knn/detail/ann_utils.cuh>  // TODO: consider remove
-#include <raft/util/raft_explicit.hpp>            // RAFT_EXPLICIT
-#include <rmm/cuda_stream_view.hpp>               // rmm:cuda_stream_view
+#include <cstdint>                            // uintX_t
+#include <raft/neighbors/ivf_flat_types.hpp>  // raft::neighbors::ivf_flat::index
+#include <raft/util/raft_explicit.hpp>        // RAFT_EXPLICIT
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
index 92ba3613d8..89a4597acf 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
@@ -25,10 +25,13 @@
 #include <raft/matrix/detail/select_k.cuh>                      // matrix::detail::select_k
 #include <raft/neighbors/detail/ivf_flat_interleaved_scan.cuh>  // interleaved_scan
 #include <raft/neighbors/ivf_flat_types.hpp>                    // raft::neighbors::ivf_flat::index
+#include <raft/spatial/knn/detail/ann_utils.cuh>                // utils::mapping
 #include <rmm/mr/device/per_device_resource.hpp>                // rmm::device_memory_resource
 
 namespace raft::neighbors::ivf_flat::detail {
 
+using namespace raft::spatial::knn::detail;  // NOLINT
+
 template <typename T, typename AccT, typename IdxT>
 void search_impl(raft::device_resources const& handle,
                  const raft::neighbors::ivf_flat::index<T, IdxT>& index,
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh
index 1bb7f97123..bec3b890eb 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_serialize.cuh
@@ -21,6 +21,7 @@
 #include <raft/neighbors/ivf_flat_types.hpp>
 #include <raft/neighbors/ivf_list.hpp>
 #include <raft/neighbors/ivf_list_types.hpp>
+#include <raft/util/pow2_utils.cuh>
 
 #include <fstream>
 
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
index 58c8d2a0dd..0d5ca90297 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh
@@ -64,22 +64,6 @@ template <typename OutT, typename LutT>
 using compute_similarity_kernel_t =
   decltype(&compute_similarity_kernel<OutT, LutT, 8, 0, true, true>);
 
-// TODO: consider remove
-template <typename OutT, typename LutT>
-struct occupancy_t {
-  using shmem_unit = Pow2<128>;
-
-  int blocks_per_sm = 0;
-  double occupancy  = 0.0;
-  double shmem_use  = 1.0;
-
-  inline occupancy_t() = default;
-  inline occupancy_t(size_t smem,
-                     uint32_t n_threads,
-                     compute_similarity_kernel_t<OutT, LutT> kernel,
-                     const cudaDeviceProp& dev_props) RAFT_EXPLICIT;
-};
-
 template <typename OutT, typename LutT>
 struct selected {
   compute_similarity_kernel_t<OutT, LutT> kernel;
diff --git a/cpp/include/raft/neighbors/ivf_flat-ext.cuh b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
index 4b261eebdc..60edf8a068 100644
--- a/cpp/include/raft/neighbors/ivf_flat-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
@@ -16,16 +16,14 @@
 
 #pragma once
 
-#include <raft/neighbors/detail/ivf_flat_build.cuh>
-#include <raft/neighbors/detail/ivf_flat_search.cuh>
-#include <raft/neighbors/ivf_flat_serialize.cuh>
-#include <raft/neighbors/ivf_flat_types.hpp>
+#include <cstdint>  // int64_t
 
-#include <raft/core/device_mdspan.hpp>
-#include <raft/core/device_resources.hpp>
-#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
+#include <raft/core/device_mdspan.hpp>     // raft::device_matrix_view
+#include <raft/core/device_resources.hpp>  // raft::device_resources
+#include <raft/neighbors/ivf_flat_serialize.cuh>
+#include <raft/neighbors/ivf_flat_types.hpp>      // raft::neighbors::ivf_flat::index
+#include <raft/util/raft_explicit.hpp>            // RAFT_EXPLICIT
+#include <rmm/mr/device/per_device_resource.hpp>  // rmm::mr::device_memory_resource
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
diff --git a/cpp/include/raft/neighbors/ivf_pq-ext.cuh b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
index 409da67e37..60588966d8 100644
--- a/cpp/include/raft/neighbors/ivf_pq-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
@@ -17,13 +17,12 @@
 #pragma once
 
 #include <cstdint>  // int64_t
-#include <raft/neighbors/ivf_pq_types.hpp>
 
-#include <raft/core/device_mdspan.hpp>
-#include <raft/core/device_resources.hpp>
-#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
+#include <raft/core/device_mdspan.hpp>            // raft::device_matrix_view
+#include <raft/core/device_resources.hpp>         // raft::device_resources
+#include <raft/neighbors/ivf_pq_types.hpp>        // raft::neighbors::ivf_pq::index
+#include <raft/util/raft_explicit.hpp>            // RAFT_EXPLICIT
+#include <rmm/mr/device/per_device_resource.hpp>  // rmm::mr::device_memory_resource
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
diff --git a/cpp/include/raft/neighbors/refine-ext.cuh b/cpp/include/raft/neighbors/refine-ext.cuh
index 79431086a7..edd14f1770 100644
--- a/cpp/include/raft/neighbors/refine-ext.cuh
+++ b/cpp/include/raft/neighbors/refine-ext.cuh
@@ -16,13 +16,13 @@
 
 #pragma once
 
-#include <raft/core/device_mdspan.hpp>
-#include <raft/core/device_resources.hpp>
-#include <raft/core/host_mdspan.hpp>
-#include <raft/matrix/matrix.cuh>
-#include <raft/neighbors/detail/refine.cuh>
-#include <raft/spatial/knn/detail/ann_utils.cuh>
-#include <raft/util/raft_explicit.hpp>
+#include <cstdint>  // int64_t
+
+#include <raft/core/device_mdspan.hpp>       // raft::device_matrix_view
+#include <raft/core/device_resources.hpp>    // raft::device_resources
+#include <raft/core/host_mdspan.hpp>         // raft::host_matrix_view
+#include <raft/distance/distance_types.hpp>  // raft::distance::DistanceType
+#include <raft/util/raft_explicit.hpp>       // RAFT_EXPLICIT
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
 
@@ -35,7 +35,8 @@ void refine(raft::device_resources const& handle,
             raft::device_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,
             raft::device_matrix_view<idx_t, matrix_idx, row_major> indices,
             raft::device_matrix_view<distance_t, matrix_idx, row_major> distances,
-            distance::DistanceType metric = distance::DistanceType::L2Unexpanded) RAFT_EXPLICIT;
+            raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded)
+  RAFT_EXPLICIT;
 
 template <typename idx_t, typename data_t, typename distance_t, typename matrix_idx>
 void refine(raft::device_resources const& handle,
@@ -44,7 +45,8 @@ void refine(raft::device_resources const& handle,
             raft::host_matrix_view<const idx_t, matrix_idx, row_major> neighbor_candidates,
             raft::host_matrix_view<idx_t, matrix_idx, row_major> indices,
             raft::host_matrix_view<distance_t, matrix_idx, row_major> distances,
-            distance::DistanceType metric = distance::DistanceType::L2Unexpanded) RAFT_EXPLICIT;
+            raft::distance::DistanceType metric = distance::DistanceType::L2Unexpanded)
+  RAFT_EXPLICIT;
 
 }  // namespace raft::neighbors
 
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
index 7d0b409bf6..199da01ddb 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers-ext.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "../../ball_cover_types.hpp"
+#include "../../ball_cover_types.hpp"   // BallCoverIndex
 #include "registers_types.cuh"          // DistFunc
 #include <cstdint>                      // uint32_t
 #include <raft/util/raft_explicit.hpp>  //RAFT_EXPLICIT
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
index 32abcf8b81..390436939f 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn-ext.cuh
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <cstddef>                           // size_t
+#include <cstdint>                           // uint32_t
 #include <raft/distance/distance_types.hpp>  // DistanceType
 #include <raft/util/raft_explicit.hpp>       // RAFT_EXPLICIT
 
diff --git a/cpp/include/raft/util/memory_pool-ext.hpp b/cpp/include/raft/util/memory_pool-ext.hpp
index e3bf11d425..a02908346b 100644
--- a/cpp/include/raft/util/memory_pool-ext.hpp
+++ b/cpp/include/raft/util/memory_pool-ext.hpp
@@ -15,7 +15,7 @@
  */
 
 #pragma once
-#include <cstddef>
+#include <cstddef>                                   // size_t
 #include <memory>                                    // std::unique_ptr
 #include <rmm/mr/device/device_memory_resource.hpp>  // rmm::mr::device_memory_resource
 

From 488a273ae068c0da0bbdcaa0ceae3d4efc99d77d Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Wed, 19 Apr 2023 17:19:37 +0200
Subject: [PATCH 88/89] Add tests to ensure headers compile in isolation

---
 cpp/test/CMakeLists.txt                       | 47 ++++++++++-
 cpp/test/ext_headers/00_generate.py           | 79 +++++++++++++++++++
 cpp/test/ext_headers/raft_core_logger.cpp     | 27 +++++++
 ...istance_detail_pairwise_matrix_dispatch.cu | 27 +++++++
 .../ext_headers/raft_distance_distance.cu     | 27 +++++++
 .../ext_headers/raft_distance_fused_l2_nn.cu  | 27 +++++++
 .../raft_linalg_detail_coalesced_reduction.cu | 27 +++++++
 .../raft_matrix_detail_select_k.cu            | 27 +++++++
 .../ext_headers/raft_neighbors_ball_cover.cu  | 27 +++++++
 .../ext_headers/raft_neighbors_brute_force.cu | 27 +++++++
 ...ghbors_detail_ivf_flat_interleaved_scan.cu | 27 +++++++
 .../raft_neighbors_detail_ivf_flat_search.cu  | 27 +++++++
 ...ghbors_detail_ivf_pq_compute_similarity.cu | 27 +++++++
 .../raft_neighbors_detail_selection_faiss.cu  | 27 +++++++
 .../ext_headers/raft_neighbors_ivf_flat.cu    | 27 +++++++
 cpp/test/ext_headers/raft_neighbors_ivf_pq.cu | 27 +++++++
 cpp/test/ext_headers/raft_neighbors_refine.cu | 27 +++++++
 ...spatial_knn_detail_ball_cover_registers.cu | 27 +++++++
 .../raft_spatial_knn_detail_fused_l2_knn.cu   | 27 +++++++
 .../ext_headers/raft_util_memory_pool.cpp     | 27 +++++++
 20 files changed, 610 insertions(+), 2 deletions(-)
 create mode 100644 cpp/test/ext_headers/00_generate.py
 create mode 100644 cpp/test/ext_headers/raft_core_logger.cpp
 create mode 100644 cpp/test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
 create mode 100644 cpp/test/ext_headers/raft_distance_distance.cu
 create mode 100644 cpp/test/ext_headers/raft_distance_fused_l2_nn.cu
 create mode 100644 cpp/test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
 create mode 100644 cpp/test/ext_headers/raft_matrix_detail_select_k.cu
 create mode 100644 cpp/test/ext_headers/raft_neighbors_ball_cover.cu
 create mode 100644 cpp/test/ext_headers/raft_neighbors_brute_force.cu
 create mode 100644 cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
 create mode 100644 cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
 create mode 100644 cpp/test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
 create mode 100644 cpp/test/ext_headers/raft_neighbors_detail_selection_faiss.cu
 create mode 100644 cpp/test/ext_headers/raft_neighbors_ivf_flat.cu
 create mode 100644 cpp/test/ext_headers/raft_neighbors_ivf_pq.cu
 create mode 100644 cpp/test/ext_headers/raft_neighbors_refine.cu
 create mode 100644 cpp/test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
 create mode 100644 cpp/test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
 create mode 100644 cpp/test/ext_headers/raft_util_memory_pool.cpp

diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index b236a21e66..0f65aa2c49 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -17,7 +17,7 @@
 
 function(ConfigureTest)
 
-  set(options OPTIONAL LIB)
+  set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY)
   set(oneValueArgs NAME)
   set(multiValueArgs PATH TARGETS CONFIGURATIONS)
 
@@ -59,7 +59,10 @@ function(ConfigureTest)
                          "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
   )
 
-  target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY")
+  if(ConfigureTest_EXPLICIT_INSTANTIATE_ONLY)
+    target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY")
+  endif()
+
   target_include_directories(${TEST_NAME} PUBLIC "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/test>")
 
   install(
@@ -146,6 +149,43 @@ if(BUILD_TESTS)
     LIB
   )
 
+  list(
+    APPEND
+    EXT_HEADER_TEST_SOURCES
+    test/ext_headers/raft_neighbors_brute_force.cu
+    test/ext_headers/raft_distance_distance.cu
+    test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
+    test/ext_headers/raft_matrix_detail_select_k.cu
+    test/ext_headers/raft_neighbors_ball_cover.cu
+    test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
+    test/ext_headers/raft_distance_fused_l2_nn.cu
+    test/ext_headers/raft_neighbors_ivf_pq.cu
+    test/ext_headers/raft_util_memory_pool.cpp
+    test/ext_headers/raft_neighbors_ivf_flat.cu
+    test/ext_headers/raft_core_logger.cpp
+    test/ext_headers/raft_neighbors_refine.cu
+    test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
+    test/ext_headers/raft_neighbors_detail_selection_faiss.cu
+    test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
+    test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
+    test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
+    test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
+  )
+
+  # Test that the split headers compile in isolation with:
+  #
+  # * EXT_HEADERS_TEST_COMPILED_EXPLICIT: RAFT_COMPILED, RAFT_EXPLICIT_INSTANTIATE_ONLY defined
+  # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined
+  # * EXT_HEADERS_TEST_IMPLICIT:          no macros defined.
+  ConfigureTest(
+    NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} OPTIONAL LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+  ConfigureTest(
+    NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} OPTIONAL LIB
+  )
+  ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES})
+
   ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu)
 
   ConfigureTest(
@@ -325,6 +365,9 @@ if(BUILD_TESTS)
     CLUSTER_TEST
     CORE_TEST
     DISTANCE_TEST
+    EXT_HEADERS_TEST_COMPILED_EXPLICIT
+    EXT_HEADERS_TEST_COMPILED_IMPLICIT
+    EXT_HEADERS_TEST_IMPLICIT
     LABEL_TEST
     LINALG_TEST
     MATRIX_TEST
diff --git a/cpp/test/ext_headers/00_generate.py b/cpp/test/ext_headers/00_generate.py
new file mode 100644
index 0000000000..15f90e1cc5
--- /dev/null
+++ b/cpp/test/ext_headers/00_generate.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+copyright_notice = """
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+"""
+
+ext_headers = [
+    "raft/neighbors/brute_force-ext.cuh",
+    "raft/distance/distance-ext.cuh",
+    "raft/distance/detail/pairwise_matrix/dispatch-ext.cuh",
+    "raft/matrix/detail/select_k-ext.cuh",
+    "raft/neighbors/ball_cover-ext.cuh",
+    "raft/spatial/knn/detail/fused_l2_knn-ext.cuh",
+    "raft/distance/fused_l2_nn-ext.cuh",
+    "raft/neighbors/ivf_pq-ext.cuh",
+    "raft/util/memory_pool-ext.hpp",
+    "raft/neighbors/ivf_flat-ext.cuh",
+    "raft/core/logger-ext.hpp",
+    "raft/neighbors/refine-ext.cuh",
+    "raft/neighbors/detail/ivf_flat_search-ext.cuh",
+    "raft/neighbors/detail/selection_faiss-ext.cuh",
+    "raft/linalg/detail/coalesced_reduction-ext.cuh",
+    "raft/spatial/knn/detail/ball_cover/registers-ext.cuh",
+    "raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh",
+    "raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh",
+]
+
+for ext_header in ext_headers:
+    header = ext_header.replace("-ext", "")
+
+    path = (
+        header
+        .replace("/", "_")
+        .replace(".cuh", ".cu")
+        .replace(".hpp", ".cpp")
+    )
+
+    with open(path, "w") as f:
+        f.write(copyright_notice)
+        f.write(f"#include <{header}>\n")
+
+    # Print the generated path so it can be pasted into CMakeLists.txt
+    print(f"test/ext_headers/{path}")
diff --git a/cpp/test/ext_headers/raft_core_logger.cpp b/cpp/test/ext_headers/raft_core_logger.cpp
new file mode 100644
index 0000000000..18ba9ef48d
--- /dev/null
+++ b/cpp/test/ext_headers/raft_core_logger.cpp
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/core/logger.hpp>
diff --git a/cpp/test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu b/cpp/test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
new file mode 100644
index 0000000000..02e4c8e331
--- /dev/null
+++ b/cpp/test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>
diff --git a/cpp/test/ext_headers/raft_distance_distance.cu b/cpp/test/ext_headers/raft_distance_distance.cu
new file mode 100644
index 0000000000..458d6385ed
--- /dev/null
+++ b/cpp/test/ext_headers/raft_distance_distance.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/distance/distance.cuh>
diff --git a/cpp/test/ext_headers/raft_distance_fused_l2_nn.cu b/cpp/test/ext_headers/raft_distance_fused_l2_nn.cu
new file mode 100644
index 0000000000..23ab58a67b
--- /dev/null
+++ b/cpp/test/ext_headers/raft_distance_fused_l2_nn.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/distance/fused_l2_nn.cuh>
diff --git a/cpp/test/ext_headers/raft_linalg_detail_coalesced_reduction.cu b/cpp/test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
new file mode 100644
index 0000000000..7f94824287
--- /dev/null
+++ b/cpp/test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/linalg/detail/coalesced_reduction.cuh>
diff --git a/cpp/test/ext_headers/raft_matrix_detail_select_k.cu b/cpp/test/ext_headers/raft_matrix_detail_select_k.cu
new file mode 100644
index 0000000000..adb10f5bbb
--- /dev/null
+++ b/cpp/test/ext_headers/raft_matrix_detail_select_k.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/matrix/detail/select_k.cuh>
diff --git a/cpp/test/ext_headers/raft_neighbors_ball_cover.cu b/cpp/test/ext_headers/raft_neighbors_ball_cover.cu
new file mode 100644
index 0000000000..8aaabe1872
--- /dev/null
+++ b/cpp/test/ext_headers/raft_neighbors_ball_cover.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/neighbors/ball_cover.cuh>
diff --git a/cpp/test/ext_headers/raft_neighbors_brute_force.cu b/cpp/test/ext_headers/raft_neighbors_brute_force.cu
new file mode 100644
index 0000000000..2c37799ae6
--- /dev/null
+++ b/cpp/test/ext_headers/raft_neighbors_brute_force.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/neighbors/brute_force.cuh>
diff --git a/cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu b/cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
new file mode 100644
index 0000000000..5a3a0b3f76
--- /dev/null
+++ b/cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/neighbors/detail/ivf_flat_interleaved_scan.cuh>
diff --git a/cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu b/cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
new file mode 100644
index 0000000000..a6274c1c80
--- /dev/null
+++ b/cpp/test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/neighbors/detail/ivf_flat_search.cuh>
diff --git a/cpp/test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu b/cpp/test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
new file mode 100644
index 0000000000..fd5ad62204
--- /dev/null
+++ b/cpp/test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/neighbors/detail/ivf_pq_compute_similarity.cuh>
diff --git a/cpp/test/ext_headers/raft_neighbors_detail_selection_faiss.cu b/cpp/test/ext_headers/raft_neighbors_detail_selection_faiss.cu
new file mode 100644
index 0000000000..f8bd21e86f
--- /dev/null
+++ b/cpp/test/ext_headers/raft_neighbors_detail_selection_faiss.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/neighbors/detail/selection_faiss.cuh>
diff --git a/cpp/test/ext_headers/raft_neighbors_ivf_flat.cu b/cpp/test/ext_headers/raft_neighbors_ivf_flat.cu
new file mode 100644
index 0000000000..ab38e4c02c
--- /dev/null
+++ b/cpp/test/ext_headers/raft_neighbors_ivf_flat.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/neighbors/ivf_flat.cuh>
diff --git a/cpp/test/ext_headers/raft_neighbors_ivf_pq.cu b/cpp/test/ext_headers/raft_neighbors_ivf_pq.cu
new file mode 100644
index 0000000000..43a66bde18
--- /dev/null
+++ b/cpp/test/ext_headers/raft_neighbors_ivf_pq.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/neighbors/ivf_pq.cuh>
diff --git a/cpp/test/ext_headers/raft_neighbors_refine.cu b/cpp/test/ext_headers/raft_neighbors_refine.cu
new file mode 100644
index 0000000000..6152f83aab
--- /dev/null
+++ b/cpp/test/ext_headers/raft_neighbors_refine.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/neighbors/refine.cuh>
diff --git a/cpp/test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu b/cpp/test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
new file mode 100644
index 0000000000..39320a40c0
--- /dev/null
+++ b/cpp/test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/spatial/knn/detail/ball_cover/registers.cuh>
diff --git a/cpp/test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu b/cpp/test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
new file mode 100644
index 0000000000..f884d1b062
--- /dev/null
+++ b/cpp/test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/spatial/knn/detail/fused_l2_knn.cuh>
diff --git a/cpp/test/ext_headers/raft_util_memory_pool.cpp b/cpp/test/ext_headers/raft_util_memory_pool.cpp
new file mode 100644
index 0000000000..11a024b958
--- /dev/null
+++ b/cpp/test/ext_headers/raft_util_memory_pool.cpp
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/util/memory_pool.hpp>

From f37c0e784f5d338ed22eea4a388f62667d185198 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Wed, 19 Apr 2023 17:54:24 +0200
Subject: [PATCH 89/89] Add macro tables to developer guide

---
 docs/source/developer_guide.md | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index d26028e458..3f95cf0a01 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -262,9 +262,31 @@ Sometimes, we need to temporarily change the log pattern (eg: for reporting deci
 
 ## Header organization of expensive function templates
 
-RAFT is a heavily templated library. Several core functions are expensive to compile and we want to prevent duplicate compilation of this functionality. To limit build time, RAFT provides a precompiled library (libraft.so) where expensive function templates are instantiated for the most commonly used template parameters. To prevent (1) accidental instantiation of these templates and (2) unnecessary dependency on the internals of these templates, we use the following header structure.
+RAFT is a heavily templated library. Several core functions are expensive to compile and we want to prevent duplicate compilation of this functionality. To limit build time, RAFT provides a precompiled library (libraft.so) where expensive function templates are instantiated for the most commonly used template parameters. To prevent (1) accidental instantiation of these templates and (2) unnecessary dependency on the internals of these templates, we use a split header structure and define macros to control template instantiation. This section describes the macros and header structure.
 
-Any header file that defines an expensive function template (say `expensive.cuh`) should be split in three parts: `expensive.cuh`, `expensive-inl.cuh`, and `expensive-ext.cuh`. The file `expensive-inl.cuh` ("inl" for "inline") contains the template definitions, i.e., the actual code. The file `expensive.cuh` includes one or both of the other two files, depending on the values of the `RAFT_COMPILED` and `RAFT_EXPLICIT_INSTANTIATE_ONLY` macros. The file `expensive-ext.cuh` contains `extern template` instantiations. In addition, if `RAFT_EXPLICIT_INSTANTIATE_ONLY` is set, it contains template definitions to ensure that a compiler error is raised in case of accidental instantiation.
+**Macros.** We define the macros `RAFT_COMPILED` and `RAFT_EXPLICIT_INSTANTIATE_ONLY`. The `RAFT_COMPILED` macro is defined by `CMake` when compiling code that (1) is part of `libraft.so` or (2) is linked with `libraft.so`. It indicates that a precompiled `libraft.so` is present at runtime.
+
+The `RAFT_EXPLICIT_INSTANTIATE_ONLY` macro is defined by `CMake` during compilation of `libraft.so` itself. When defined, it indicates that implicit instantiations of expensive function templates are forbidden (they result in a compiler error). In the RAFT project, we additionally define this macro during compilation of the tests and benchmarks. 
+
+Below, we summarize which combinations of `RAFT_COMPILED` and `RAFT_EXPLICIT_INSTANTIATE_ONLY` are used in practice and what effect each combination has.
+
+| RAFT_COMPILED | RAFT_EXPLICIT_INSTANTIATE_ONLY | Which targets                                                                                        |
+|---------------|--------------------------------|------------------------------------------------------------------------------------------------------|
+| defined       | defined                        | `raft::compiled`, RAFT tests, RAFT benchmarks                                                        |
+| defined       |                                | Downstream libraries depending on `libraft` like cuML, cuGraph.                                      |
+|               |                                | Downstream libraries depending on `libraft-headers` like cugraph-ops.                                |
+
+
+| RAFT_COMPILED | RAFT_EXPLICIT_INSTANTIATE_ONLY | Effect                                                                                                |
+|---------------|--------------------------------|-------------------------------------------------------------------------------------------------------|
+| defined       | defined                        | Templates are precompiled. Compiler error on accidental instantiation of expensive function template. |
+| defined       |                                | Templates are precompiled. Implicit instantiation allowed.                                            |
+|               |                                | Nothing precompiled. Implicit instantiation allowed.                                                  |
+|               | defined                        | Avoid this: nothing precompiled. Compiler error on any instantiation of expensive function template.  |
+
+
+
+**Header organization.** Any header file that defines an expensive function template (say `expensive.cuh`) should be split into three parts: `expensive.cuh`, `expensive-inl.cuh`, and `expensive-ext.cuh`. The file `expensive-inl.cuh` ("inl" for "inline") contains the template definitions, i.e., the actual code. The file `expensive.cuh` includes one or both of the other two files, depending on the values of the `RAFT_COMPILED` and `RAFT_EXPLICIT_INSTANTIATE_ONLY` macros. The file `expensive-ext.cuh` contains `extern template` instantiations. In addition, if `RAFT_EXPLICIT_INSTANTIATE_ONLY` is set, it contains template definitions to ensure that a compiler error is raised in case of accidental instantiation.
 
 The dispatching by `expensive.cuh` is performed as follows:
 ``` c++