From 85589acac80abc87eed065ce3da575316d163c84 Mon Sep 17 00:00:00 2001 From: achirkin Date: Mon, 23 Jan 2023 17:10:23 +0100 Subject: [PATCH 1/4] Squash-merge enh-matrix-topk --- cpp/bench/CMakeLists.txt | 6 +- cpp/bench/matrix/select_k.cu | 133 +++++ cpp/bench/neighbors/selection.cu | 123 ----- .../topk.cuh => matrix/detail/select_k.cuh} | 58 +-- .../detail/select_radix.cuh} | 113 +++-- .../detail/select_warpsort.cuh} | 415 +++++++++++----- cpp/include/raft/matrix/select_k.cuh | 110 +++++ cpp/include/raft/neighbors/detail/refine.cuh | 4 +- .../spatial/knn/detail/ivf_flat_search.cuh | 75 +-- .../raft/spatial/knn/detail/ivf_pq_search.cuh | 79 ++- cpp/include/raft/spatial/knn/knn.cuh | 38 +- .../knn/detail/topk => util}/bitonic_sort.cuh | 83 ++-- cpp/include/raft/util/integer_utils.hpp | 34 +- cpp/test/CMakeLists.txt | 5 +- cpp/test/matrix/select_k.cu | 459 ++++++++++++++++++ cpp/test/matrix/select_k.cuh | 127 +++++ cpp/test/neighbors/ann_ivf_flat.cu | 8 +- cpp/test/neighbors/ann_utils.cuh | 23 +- cpp/test/neighbors/selection.cu | 2 +- cpp/test/util/bitonic_sort.cu | 200 ++++++++ docs/source/cpp_api/matrix_ordering.rst | 12 + 21 files changed, 1631 insertions(+), 476 deletions(-) create mode 100644 cpp/bench/matrix/select_k.cu delete mode 100644 cpp/bench/neighbors/selection.cu rename cpp/include/raft/{spatial/knn/detail/topk.cuh => matrix/detail/select_k.cuh} (59%) rename cpp/include/raft/{spatial/knn/detail/topk/radix_topk.cuh => matrix/detail/select_radix.cuh} (87%) rename cpp/include/raft/{spatial/knn/detail/topk/warpsort_topk.cuh => matrix/detail/select_warpsort.cuh} (71%) create mode 100644 cpp/include/raft/matrix/select_k.cuh rename cpp/include/raft/{spatial/knn/detail/topk => util}/bitonic_sort.cuh (68%) create mode 100644 cpp/test/matrix/select_k.cu create mode 100644 cpp/test/matrix/select_k.cuh create mode 100644 cpp/test/util/bitonic_sort.cu diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 8dcdb325e9..6b985acfc3 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -103,7 +103,10 @@ if(BUILD_BENCH) bench/main.cpp ) - ConfigureBench(NAME MATRIX_BENCH PATH bench/matrix/argmin.cu bench/matrix/gather.cu bench/main.cpp) + ConfigureBench( + NAME MATRIX_BENCH PATH bench/matrix/argmin.cu bench/matrix/gather.cu bench/matrix/select_k.cu + bench/main.cpp + ) ConfigureBench( NAME RANDOM_BENCH PATH bench/random/make_blobs.cu bench/random/permute.cu bench/random/rng.cu @@ -127,7 +130,6 @@ if(BUILD_BENCH) bench/neighbors/knn/ivf_pq_int8_t_int64_t.cu bench/neighbors/knn/ivf_pq_uint8_t_uint32_t.cu bench/neighbors/refine.cu - bench/neighbors/selection.cu bench/main.cpp OPTIONAL DIST diff --git a/cpp/bench/matrix/select_k.cu b/cpp/bench/matrix/select_k.cu new file mode 100644 index 0000000000..452a50ba50 --- /dev/null +++ b/cpp/bench/matrix/select_k.cu @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * TODO: reconsider how to organize shared test+bench files better + * Related Issue: https://github.com/rapidsai/raft/issues/1153 + * (although this header does not depend on any gtest headers) + */ +#include "../../test/matrix/select_k.cuh" + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace raft::matrix { + +using namespace raft::bench; // NOLINT + +template +struct selection : public fixture { + explicit selection(const select::params& p) + : params_(p), + in_dists_(p.batch_size * p.len, stream), + in_ids_(p.batch_size * p.len, stream), + out_dists_(p.batch_size * p.k, stream), + out_ids_(p.batch_size * p.k, stream) + { + raft::sparse::iota_fill(in_ids_.data(), IdxT(p.batch_size), IdxT(p.len), stream); + raft::random::RngState state{42}; + raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), KeyT(-1.0), KeyT(1.0)); + } + + void run_benchmark(::benchmark::State& state) override // NOLINT + { + handle_t handle{stream}; + using_pool_memory_res res; + try { + std::ostringstream label_stream; + label_stream << params_.batch_size << "#" << params_.len << "#" << params_.k; + state.SetLabel(label_stream.str()); + loop_on_state(state, [this, &handle]() { + select::select_k_impl(handle, + Algo, + in_dists_.data(), + in_ids_.data(), + params_.batch_size, + params_.len, + params_.k, + out_dists_.data(), + out_ids_.data(), + params_.select_min); + }); + } catch (raft::exception& e) { + state.SkipWithError(e.what()); + } + } + + private: + const select::params params_; + rmm::device_uvector in_dists_, out_dists_; + rmm::device_uvector in_ids_, out_ids_; +}; + +const std::vector kInputs{ + {20000, 500, 1, true}, {20000, 500, 2, true}, {20000, 500, 4, true}, + {20000, 500, 8, true}, {20000, 500, 16, true}, {20000, 500, 32, true}, + {20000, 500, 64, true}, {20000, 500, 128, true}, {20000, 500, 256, true}, + + {1000, 10000, 1, true}, {1000, 10000, 2, true}, {1000, 10000, 4, true}, + {1000, 10000, 8, true}, {1000, 10000, 16, true}, {1000, 10000, 32, true}, + {1000, 10000, 64, true}, {1000, 10000, 128, true}, {1000, 10000, 256, true}, + + {100, 100000, 1, true}, {100, 100000, 2, true}, {100, 100000, 4, true}, + {100, 100000, 8, true}, {100, 100000, 16, true}, {100, 100000, 32, true}, + {100, 100000, 64, true}, {100, 100000, 128, true}, {100, 100000, 256, true}, + + {10, 1000000, 1, true}, {10, 1000000, 2, true}, {10, 1000000, 4, true}, + {10, 1000000, 8, true}, {10, 1000000, 16, true}, {10, 1000000, 32, true}, + {10, 1000000, 64, true}, {10, 1000000, 128, true}, {10, 1000000, 256, true}, +}; + +#define SELECTION_REGISTER(KeyT, IdxT, A) \ + namespace BENCHMARK_PRIVATE_NAME(selection) \ + { \ + using SelectK = selection; \ + RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #A, kInputs); \ + } + +SELECTION_REGISTER(float, int, kPublicApi); // NOLINT +SELECTION_REGISTER(float, int, kRadix8bits); // NOLINT +SELECTION_REGISTER(float, int, kRadix11bits); // NOLINT +SELECTION_REGISTER(float, int, kWarpAuto); // NOLINT +SELECTION_REGISTER(float, int, kWarpImmediate); // NOLINT +SELECTION_REGISTER(float, int, kWarpFiltered); // NOLINT +SELECTION_REGISTER(float, int, kWarpDistributed); // NOLINT +SELECTION_REGISTER(float, int, kWarpDistributedShm); // NOLINT + +SELECTION_REGISTER(double, int, kRadix8bits); // NOLINT +SELECTION_REGISTER(double, int, kRadix11bits); // NOLINT +SELECTION_REGISTER(double, int, kWarpAuto); // NOLINT + +SELECTION_REGISTER(double, size_t, kRadix8bits); // NOLINT 
+SELECTION_REGISTER(double, size_t, kRadix11bits); // NOLINT +SELECTION_REGISTER(double, size_t, kWarpImmediate); // NOLINT +SELECTION_REGISTER(double, size_t, kWarpFiltered); // NOLINT +SELECTION_REGISTER(double, size_t, kWarpDistributed); // NOLINT +SELECTION_REGISTER(double, size_t, kWarpDistributedShm); // NOLINT + +} // namespace raft::matrix diff --git a/cpp/bench/neighbors/selection.cu b/cpp/bench/neighbors/selection.cu deleted file mode 100644 index 1f116c199f..0000000000 --- a/cpp/bench/neighbors/selection.cu +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#if defined RAFT_NN_COMPILED -#include -#endif - -#include -#include - -#include -#include - -namespace raft::bench::spatial { - -struct params { - int n_inputs; - int input_len; - int k; - int select_min; -}; - -template -struct selection : public fixture { - explicit selection(const params& p) - : params_(p), - in_dists_(p.n_inputs * p.input_len, stream), - in_ids_(p.n_inputs * p.input_len, stream), - out_dists_(p.n_inputs * p.k, stream), - out_ids_(p.n_inputs * p.k, stream) - { - raft::sparse::iota_fill(in_ids_.data(), IdxT(p.n_inputs), IdxT(p.input_len), stream); - raft::random::RngState state{42}; - raft::random::uniform(handle, state, in_dists_.data(), in_dists_.size(), KeyT(-1.0), KeyT(1.0)); - } - - void run_benchmark(::benchmark::State& state) override - { - using_pool_memory_res res; - try { - std::ostringstream label_stream; - label_stream << params_.n_inputs << "#" << params_.input_len << "#" << params_.k; - state.SetLabel(label_stream.str()); - loop_on_state(state, [this]() { - raft::spatial::knn::select_k(in_dists_.data(), - in_ids_.data(), - params_.n_inputs, - params_.input_len, - out_dists_.data(), - out_ids_.data(), - params_.select_min, - params_.k, - stream, - Algo); - }); - } catch (raft::exception& e) { - state.SkipWithError(e.what()); - } - } - - private: - const params params_; - rmm::device_uvector in_dists_, out_dists_; - rmm::device_uvector in_ids_, out_ids_; -}; - -const std::vector kInputs{ - {20000, 500, 1, true}, {20000, 500, 2, true}, {20000, 500, 4, true}, - {20000, 500, 8, true}, {20000, 500, 16, true}, {20000, 500, 32, true}, - {20000, 500, 64, true}, {20000, 500, 128, true}, {20000, 500, 256, true}, - - {1000, 10000, 1, true}, {1000, 10000, 2, true}, {1000, 10000, 4, true}, - {1000, 10000, 8, true}, {1000, 10000, 16, true}, {1000, 10000, 32, true}, - {1000, 10000, 64, true}, {1000, 10000, 128, true}, {1000, 10000, 256, true}, - - {100, 100000, 1, true}, {100, 100000, 2, true}, {100, 100000, 4, true}, - {100, 100000, 8, true}, {100, 100000, 16, true}, {100, 100000, 32, true}, - {100, 100000, 64, true}, {100, 100000, 128, true}, {100, 100000, 256, true}, - - {10, 1000000, 1, true}, {10, 1000000, 2, true}, {10, 1000000, 4, true}, - {10, 1000000, 8, true}, {10, 1000000, 16, true}, {10, 1000000, 32, true}, - {10, 1000000, 64, true}, {10, 1000000, 128, true}, {10, 1000000, 256, 
true}, -}; - -#define SELECTION_REGISTER(KeyT, IdxT, Algo) \ - namespace BENCHMARK_PRIVATE_NAME(selection) \ - { \ - using SelectK = selection; \ - RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #Algo, kInputs); \ - } - -SELECTION_REGISTER(float, int, FAISS); -SELECTION_REGISTER(float, int, RADIX_8_BITS); -SELECTION_REGISTER(float, int, RADIX_11_BITS); -SELECTION_REGISTER(float, int, WARP_SORT); - -SELECTION_REGISTER(double, int, FAISS); -SELECTION_REGISTER(double, int, RADIX_8_BITS); -SELECTION_REGISTER(double, int, RADIX_11_BITS); -SELECTION_REGISTER(double, int, WARP_SORT); - -SELECTION_REGISTER(double, size_t, FAISS); -SELECTION_REGISTER(double, size_t, RADIX_8_BITS); -SELECTION_REGISTER(double, size_t, RADIX_11_BITS); -SELECTION_REGISTER(double, size_t, WARP_SORT); - -} // namespace raft::bench::spatial diff --git a/cpp/include/raft/spatial/knn/detail/topk.cuh b/cpp/include/raft/matrix/detail/select_k.cuh similarity index 59% rename from cpp/include/raft/spatial/knn/detail/topk.cuh rename to cpp/include/raft/matrix/detail/select_k.cuh index f4dcb53088..ac1ba3dfa3 100644 --- a/cpp/include/raft/spatial/knn/detail/topk.cuh +++ b/cpp/include/raft/matrix/detail/select_k.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,34 +16,34 @@ #pragma once -#include "topk/radix_topk.cuh" -#include "topk/warpsort_topk.cuh" +#include "select_radix.cuh" +#include "select_warpsort.cuh" #include #include #include -namespace raft::spatial::knn::detail { +namespace raft::matrix::detail { /** * Select k smallest or largest key/values from each row in the input data. * - * If you think of the input data `in_keys` as a row-major matrix with len columns and - * batch_size rows, then this function selects k smallest/largest values in each row and fills - * in the row-major matrix `out` of size (batch_size, k). + * If you think of the input data `in_val` as a row-major matrix with `len` columns and + * `batch_size` rows, then this function selects `k` smallest/largest values in each row and fills + * in the row-major matrix `out_val` of size (batch_size, k). * * @tparam T * the type of the keys (what is being compared). * @tparam IdxT * the index type (what is being selected together with the keys). * - * @param[in] in + * @param[in] in_val * contiguous device array of inputs of size (len * batch_size); * these are compared and selected. * @param[in] in_idx * contiguous device array of inputs of size (len * batch_size); - * typically, these are indices of the corresponding in_keys. + * typically, these are indices of the corresponding in_val. * @param batch_size * number of input rows, i.e. the batch size. * @param len @@ -51,12 +51,12 @@ namespace raft::spatial::knn::detail { * Invariant: len >= k. * @param k * the number of outputs to select in each input row. - * @param[out] out + * @param[out] out_val * contiguous device array of outputs of size (k * batch_size); - * the k smallest/largest values from each row of the `in_keys`. + * the k smallest/largest values from each row of the `in_val`. * @param[out] out_idx * contiguous device array of outputs of size (k * batch_size); - * the payload selected together with `out`. + * the payload selected together with `out_val`. * @param select_min * whether to select k smallest (true) or largest (false) keys. 
* @param stream @@ -64,28 +64,28 @@ namespace raft::spatial::knn::detail { * memory pool here to avoid memory allocations within the call). */ template -void select_topk(const T* in, - const IdxT* in_idx, - size_t batch_size, - size_t len, - int k, - T* out, - IdxT* out_idx, - bool select_min, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = nullptr) +void select_k(const T* in_val, + const IdxT* in_idx, + size_t batch_size, + size_t len, + int k, + T* out_val, + IdxT* out_idx, + bool select_min, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = nullptr) { common::nvtx::range fun_scope( - "matrix::select_topk(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k); + "matrix::select_k(batch_size = %zu, len = %zu, k = %d)", batch_size, len, k); // TODO (achirkin): investigate the trade-off for a wider variety of inputs. const bool radix_faster = batch_size >= 64 && len >= 102400 && k >= 128; - if (k <= raft::spatial::knn::detail::topk::kMaxCapacity && !radix_faster) { - topk::warp_sort_topk( - in, in_idx, batch_size, len, k, out, out_idx, select_min, stream, mr); + if (k <= select::warpsort::kMaxCapacity && !radix_faster) { + select::warpsort::select_k( + in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); } else { - topk::radix_topk= 4 ? 11 : 8), 512>( - in, in_idx, batch_size, len, k, out, out_idx, select_min, stream, mr); + select::radix::select_k= 4 ? 11 : 8), 512>( + in_val, in_idx, batch_size, len, k, out_val, out_idx, select_min, stream, mr); } } -} // namespace raft::spatial::knn::detail +} // namespace raft::matrix::detail diff --git a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh similarity index 87% rename from cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh rename to cpp/include/raft/matrix/detail/select_radix.cuh index 9c0f20b706..de19e63a4c 100644 --- a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh +++ b/cpp/include/raft/matrix/detail/select_radix.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -27,29 +28,29 @@ #include #include -#include +#include #include -namespace raft::spatial::knn::detail::topk { +namespace raft::matrix::detail::select::radix { constexpr int ITEM_PER_THREAD = 32; constexpr int VECTORIZED_READ_SIZE = 16; template -__host__ __device__ constexpr int calc_num_buckets() +_RAFT_HOST_DEVICE constexpr int calc_num_buckets() { return 1 << BitsPerPass; } template -__host__ __device__ constexpr int calc_num_passes() +_RAFT_HOST_DEVICE constexpr int calc_num_passes() { return ceildiv(sizeof(T) * 8, BitsPerPass); } // Minimum reasonable block size for the given radix size. template -__host__ __device__ constexpr int calc_min_block_size() +_RAFT_HOST_DEVICE constexpr int calc_min_block_size() { return 1 << std::max(BitsPerPass - 4, Pow2::Log2 + 1); } @@ -62,7 +63,7 @@ __host__ __device__ constexpr int calc_min_block_size() * NB: Use pass=-1 for calc_mask(). 
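// A hedged usage sketch of the renamed detail API `raft::matrix::detail::select_k`
// above (not taken from the patch; buffer names and shapes are illustrative assumptions):
//
//   size_t batch_size = 100, len = 100000;
//   int k = 64;
//   rmm::device_uvector<float> in_val(batch_size * len, stream);
//   rmm::device_uvector<int>   in_idx(batch_size * len, stream);
//   rmm::device_uvector<float> out_val(batch_size * k, stream);
//   rmm::device_uvector<int>   out_idx(batch_size * k, stream);
//   // ... fill in_val / in_idx ...
//   raft::matrix::detail::select_k<float, int>(
//     in_val.data(), in_idx.data(), batch_size, len, k,
//     out_val.data(), out_idx.data(), /*select_min=*/true, stream);
//
// With k = 64 <= select::warpsort::kMaxCapacity (256) and len < 102400, the
// heuristic above dispatches to the warp-sort path; k > 256, or a batch of
// >= 64 long rows with k >= 128, would take the radix path instead.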
*/ template -__device__ constexpr int calc_start_bit(int pass) +_RAFT_DEVICE constexpr int calc_start_bit(int pass) { int start_bit = static_cast(sizeof(T) * 8) - (pass + 1) * BitsPerPass; if (start_bit < 0) { start_bit = 0; } @@ -70,7 +71,7 @@ __device__ constexpr int calc_start_bit(int pass) } template -__device__ constexpr unsigned calc_mask(int pass) +_RAFT_DEVICE constexpr unsigned calc_mask(int pass) { static_assert(BitsPerPass <= 31); int num_bits = calc_start_bit(pass - 1) - calc_start_bit(pass); @@ -82,7 +83,7 @@ __device__ constexpr unsigned calc_mask(int pass) * as of integers. */ template -__device__ typename cub::Traits::UnsignedBits twiddle_in(T key, bool greater) +_RAFT_DEVICE typename cub::Traits::UnsignedBits twiddle_in(T key, bool greater) { auto bits = reinterpret_cast::UnsignedBits&>(key); bits = cub::Traits::TwiddleIn(bits); @@ -91,7 +92,7 @@ __device__ typename cub::Traits::UnsignedBits twiddle_in(T key, bool greater) } template -__device__ int calc_bucket(T x, int start_bit, unsigned mask, bool greater) +_RAFT_DEVICE int calc_bucket(T x, int start_bit, unsigned mask, bool greater) { static_assert(BitsPerPass <= sizeof(int) * 8 - 1); // so return type can be int return (twiddle_in(x, greater) >> start_bit) & mask; @@ -112,7 +113,7 @@ __device__ int calc_bucket(T x, int start_bit, unsigned mask, bool greater) * @param f the lambda taking two arguments (T x, IdxT idx) */ template -__device__ void vectorized_process(const T* in, IdxT len, Func f) +_RAFT_DEVICE void vectorized_process(const T* in, IdxT len, Func f) { const IdxT stride = blockDim.x * gridDim.x; const int tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -167,18 +168,18 @@ struct Counter { * (see steps 4-1 in `radix_kernel` description). */ template -__device__ void filter_and_histogram(const T* in_buf, - const IdxT* in_idx_buf, - T* out_buf, - IdxT* out_idx_buf, - T* out, - IdxT* out_idx, - IdxT len, - Counter* counter, - IdxT* histogram, - bool greater, - int pass, - int k) +_RAFT_DEVICE void filter_and_histogram(const T* in_buf, + const IdxT* in_idx_buf, + T* out_buf, + IdxT* out_idx_buf, + T* out, + IdxT* out_idx, + IdxT len, + Counter* counter, + IdxT* histogram, + bool greater, + int pass, + int k) { constexpr int num_buckets = calc_num_buckets(); __shared__ IdxT histogram_smem[num_buckets]; @@ -260,10 +261,10 @@ __device__ void filter_and_histogram(const T* in_buf, * (step 2 in `radix_kernel` description) */ template -__device__ void scan(volatile IdxT* histogram, - const int start, - const int num_buckets, - const IdxT current) +_RAFT_DEVICE void scan(volatile IdxT* histogram, + const int start, + const int num_buckets, + const IdxT current) { typedef cub::BlockScan BlockScan; __shared__ typename BlockScan::TempStorage temp_storage; @@ -284,7 +285,7 @@ __device__ void scan(volatile IdxT* histogram, * (steps 2-3 in `radix_kernel` description) */ template -__device__ void choose_bucket(Counter* counter, IdxT* histogram, const IdxT k) +_RAFT_DEVICE void choose_bucket(Counter* counter, IdxT* histogram, const IdxT k) { constexpr int num_buckets = calc_num_buckets(); int index = threadIdx.x; @@ -547,21 +548,21 @@ inline dim3 get_optimal_grid_size(size_t req_batch_size, size_t len) * memory pool here to avoid memory allocations within the call). 
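// Worked example of the pass arithmetic defined earlier in this file
// (float keys, BitsPerPass = 11; the numbers below are derived, not in the patch):
//   calc_num_passes<float, 11>() == ceildiv(32, 11) == 3
//   pass 0: start_bit = 32 - 11 = 21  -> mask covers bits [21, 31] (11 bits)
//   pass 1: start_bit = 32 - 22 = 10  -> mask covers bits [10, 20] (11 bits)
//   pass 2: start_bit = max(32 - 33, 0) = 0 -> mask covers bits [0, 9] (10 bits)
// The most significant bits are inspected first, so each pass narrows the
// candidate bucket chosen by choose_bucket() until only k elements remain.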
*/ template -void radix_topk(const T* in, - const IdxT* in_idx, - size_t batch_size, - size_t len, - int k, - T* out, - IdxT* out_idx, - bool select_min, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = nullptr) +void select_k(const T* in, + const IdxT* in_idx, + size_t batch_size, + size_t len, + int k, + T* out, + IdxT* out_idx, + bool select_min, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = nullptr) { // reduce the block size if the input length is too small. if constexpr (BlockSize > calc_min_block_size()) { if (BlockSize * ITEM_PER_THREAD > len) { - return radix_topk( + return select_k( in, in_idx, batch_size, len, k, out, out_idx, select_min, stream); } } @@ -573,23 +574,33 @@ void radix_topk(const T* in, dim3 blocks = get_optimal_grid_size(batch_size, len); size_t max_chunk_size = blocks.y; - auto pool_guard = raft::get_pool_memory_resource( - mr, - max_chunk_size * (sizeof(Counter) // counters - + sizeof(IdxT) * (num_buckets + 2) // histograms and IdxT bufs - + sizeof(T) * 2 // T bufs - )); + size_t req_aux = max_chunk_size * (sizeof(Counter) + num_buckets * sizeof(IdxT)); + size_t req_buf = max_chunk_size * len * 2 * (sizeof(T) + sizeof(IdxT)); + size_t mem_req = req_aux + req_buf; + size_t mem_free, mem_total; + RAFT_CUDA_TRY(cudaMemGetInfo(&mem_free, &mem_total)); + std::optional managed_memory; + rmm::mr::device_memory_resource* mr_buf = nullptr; + if (mem_req > mem_free) { + // if there's not enough memory for buffers on the device, resort to the managed memory. + mem_req = req_aux; + managed_memory.emplace(); + mr_buf = &managed_memory.value(); + } + + auto pool_guard = raft::get_pool_memory_resource(mr, mem_req); if (pool_guard) { - RAFT_LOG_DEBUG("radix_topk: using pool memory resource with initial size %zu bytes", + RAFT_LOG_DEBUG("radix::select_k: using pool memory resource with initial size %zu bytes", pool_guard->pool_size()); } + if (mr_buf == nullptr) { mr_buf = mr; } rmm::device_uvector> counters(max_chunk_size, stream, mr); - rmm::device_uvector histograms(num_buckets * max_chunk_size, stream, mr); - rmm::device_uvector buf1(len * max_chunk_size, stream, mr); - rmm::device_uvector idx_buf1(len * max_chunk_size, stream, mr); - rmm::device_uvector buf2(len * max_chunk_size, stream, mr); - rmm::device_uvector idx_buf2(len * max_chunk_size, stream, mr); + rmm::device_uvector histograms(max_chunk_size * num_buckets, stream, mr); + rmm::device_uvector buf1(max_chunk_size * len, stream, mr_buf); + rmm::device_uvector idx_buf1(max_chunk_size * len, stream, mr_buf); + rmm::device_uvector buf2(max_chunk_size * len, stream, mr_buf); + rmm::device_uvector idx_buf2(max_chunk_size * len, stream, mr_buf); for (size_t offset = 0; offset < batch_size; offset += max_chunk_size) { blocks.y = std::min(max_chunk_size, batch_size - offset); @@ -646,4 +657,4 @@ void radix_topk(const T* in, } } -} // namespace raft::spatial::knn::detail::topk +} // namespace raft::matrix::detail::select::radix diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh similarity index 71% rename from cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh rename to cpp/include/raft/matrix/detail/select_warpsort.cuh index c06aa04aea..d362b73792 100644 --- a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh +++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh @@ -16,10 +16,11 @@ #pragma once -#include "bitonic_sort.cuh" - +#include #include +#include #include +#include #include 
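// Sizing note for the managed-memory fallback in select_radix.cuh above
// (numbers are illustrative, not from the patch): the candidate buffers dominate,
//   req_buf = max_chunk_size * len * 2 * (sizeof(T) + sizeof(IdxT)),
// e.g. max_chunk_size = 64, len = 1'000'000, T = float, IdxT = uint32_t gives
//   64 * 1e6 * 2 * 8 B ~= 1.0 GB,
// while req_aux (counters plus histograms) stays in the KiB-to-MiB range. Hence
// only buf1/idx_buf1/buf2/idx_buf2 are demoted to the managed memory resource
// when req_aux + req_buf exceeds the free device memory reported by
// cudaMemGetInfo; the small auxiliary arrays remain on `mr`.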
#include @@ -31,12 +32,12 @@ /* Three APIs of different scopes are provided: - 1. host function: warp_sort_topk() + 1. host function: select_k() 2. block-wide API: class block_sort 3. warp-wide API: several implementations of warp_sort_* - 1. warp_sort_topk() + 1. select_k() (see the docstring) 2. class block_sort @@ -74,7 +75,7 @@ These two classes can be regarded as fixed size priority queue for a warp. Usage is similar to class block_sort. No shared memory is needed. - The host function (warp_sort_topk) uses a heuristic to choose between these two classes for + The host function (select_k) uses a heuristic to choose between these two classes for sorting, warp_sort_immediate being chosen when the number of inputs per warp is somewhat small (see the usage of LaunchThreshold::len_factor_for_choosing). @@ -94,7 +95,7 @@ } */ -namespace raft::spatial::knn::detail::topk { +namespace raft::matrix::detail::select::warpsort { static constexpr int kMaxCapacity = 256; @@ -102,18 +103,12 @@ namespace { /** Whether 'left` should indeed be on the left w.r.t. `right`. */ template -__device__ __forceinline__ auto is_ordered(T left, T right) -> bool +_RAFT_DEVICE _RAFT_FORCEINLINE auto is_ordered(T left, T right) -> bool { if constexpr (Ascending) { return left < right; } if constexpr (!Ascending) { return left > right; } } -constexpr auto calc_capacity(int k) -> int -{ - int capacity = isPo2(k) ? k : (1 << (log2(k) + 1)); - return capacity; -} - } // namespace /** @@ -134,7 +129,7 @@ constexpr auto calc_capacity(int k) -> int */ template class warp_sort { - static_assert(isPo2(Capacity)); + static_assert(is_a_power_of_two(Capacity)); static_assert(std::is_default_constructible_v); public: @@ -148,13 +143,16 @@ class warp_sort { /** The number of elements to select. */ const int k; + /** Extra memory required per-block for keeping the state (shared or global). */ + constexpr static auto mem_required(uint32_t block_size) -> size_t { return 0; } + /** * Construct the warp_sort empty queue. * * @param k * number of elements to select. */ - __device__ warp_sort(int k) : k(k) + _RAFT_DEVICE warp_sort(int k) : k(k) { #pragma unroll for (int i = 0; i < kMaxArrLen; i++) { @@ -182,7 +180,7 @@ class warp_sort { * It serves as a conditional; when `false` the function does nothing. * We need it to ensure threads within a full warp don't diverge calling `bitonic::merge()`. */ - __device__ void load_sorted(const T* in, const IdxT* in_idx, bool do_merge = true) + _RAFT_DEVICE void load_sorted(const T* in, const IdxT* in_idx, bool do_merge = true) { if (do_merge) { int idx = Pow2::mod(laneId()) ^ Pow2::Mask; @@ -198,7 +196,7 @@ class warp_sort { } } if (kWarpWidth < WarpSize || do_merge) { - topk::bitonic(Ascending, kWarpWidth).merge(val_arr_, idx_arr_); + util::bitonic(Ascending, kWarpWidth).merge(val_arr_, idx_arr_); } } @@ -211,15 +209,23 @@ class warp_sort { * @param[out] out_idx * device pointer to a contiguous array, unique per-subwarp of size `kWarpWidth` * (length: k <= kWarpWidth * kMaxArrLen). 
+ * @param valF (optional) postprocess values (T -> OutT) + * @param idxF (optional) postprocess indices (IdxT -> OutIdxT) */ - template - __device__ void store(T* out, IdxT* out_idx, Lambda post_process = raft::identity_op()) const + template + _RAFT_DEVICE void store(OutT* out, + OutIdxT* out_idx, + ValF valF = raft::identity_op{}, + IdxF idxF = raft::identity_op{}) const { int idx = Pow2::mod(laneId()); #pragma unroll kMaxArrLen for (int i = 0; i < kMaxArrLen && idx < k; i++, idx += kWarpWidth) { - out[idx] = post_process(val_arr_[i]); - out_idx[idx] = idx_arr_[i]; + out[idx] = valF(val_arr_[i]); + out_idx[idx] = idxF(idx_arr_[i]); } } @@ -246,8 +252,8 @@ class warp_sort { * the associated indices of the elements in the same format as `keys_in`. */ template - __device__ __forceinline__ void merge_in(const T* __restrict__ keys_in, - const IdxT* __restrict__ ids_in) + _RAFT_DEVICE _RAFT_FORCEINLINE void merge_in(const T* __restrict__ keys_in, + const IdxT* __restrict__ ids_in) { #pragma unroll for (int i = std::min(kMaxArrLen, PerThreadSizeIn); i > 0; i--) { @@ -258,7 +264,7 @@ class warp_sort { idx_arr_[kMaxArrLen - i] = ids_in[PerThreadSizeIn - i]; } } - topk::bitonic(Ascending, kWarpWidth).merge(val_arr_, idx_arr_); + util::bitonic(Ascending, kWarpWidth).merge(val_arr_, idx_arr_); } }; @@ -276,8 +282,9 @@ class warp_sort_filtered : public warp_sort { using warp_sort::kDummy; using warp_sort::kWarpWidth; using warp_sort::k; + using warp_sort::mem_required; - __device__ warp_sort_filtered(int k, T limit) + explicit _RAFT_DEVICE warp_sort_filtered(int k, T limit = kDummy) : warp_sort(k), buf_len_(0), k_th_(limit) { #pragma unroll @@ -287,12 +294,14 @@ class warp_sort_filtered : public warp_sort { } } - __device__ __forceinline__ explicit warp_sort_filtered(int k) - : warp_sort_filtered(k, kDummy) + _RAFT_DEVICE _RAFT_FORCEINLINE static auto init_blockwide(int k, + uint8_t* = nullptr, + T limit = kDummy) { + return warp_sort_filtered{k, limit}; } - __device__ void add(T val, IdxT idx) + _RAFT_DEVICE void add(T val, IdxT idx) { // comparing for k_th should reduce the total amount of updates: // `false` means the input value is surely not in the top-k values. @@ -310,22 +319,22 @@ class warp_sort_filtered : public warp_sort { if (do_add) { add_to_buf_(val, idx); } } - __device__ void done() + _RAFT_DEVICE void done() { if (any(buf_len_ != 0)) { merge_buf_(); } } private: - __device__ __forceinline__ void set_k_th_() + _RAFT_DEVICE _RAFT_FORCEINLINE void set_k_th_() { // NB on using srcLane: it's ok if it is outside the warp size / width; // the modulo op will be done inside the __shfl_sync. k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth); } - __device__ __forceinline__ void merge_buf_() + _RAFT_DEVICE _RAFT_FORCEINLINE void merge_buf_() { - topk::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_); + util::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_); this->merge_in(val_buf_, idx_buf_); buf_len_ = 0; set_k_th_(); // contains warp sync @@ -335,7 +344,7 @@ class warp_sort_filtered : public warp_sort { } } - __device__ __forceinline__ void add_to_buf_(T val, IdxT idx) + _RAFT_DEVICE _RAFT_FORCEINLINE void add_to_buf_(T val, IdxT idx) { // NB: the loop is used here to ensure the constant indexing, // to not force the buffers spill into the local memory. 
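// A hedged trace of warp_sort_filtered::add() above (values illustrative):
// with k = 4 and a queue currently holding {0.1, 0.2, 0.3, 0.4} (ascending),
// k_th_ == 0.4. An input of 0.9 fails is_ordered(0.9, 0.4) and is dropped by
// the cheap single comparison; an input of 0.25 is appended to the per-lane
// buffer instead. Once any lane fills its buffer, merge_buf_() bitonic-sorts
// the buffered candidates, merges them into the queue, and refreshes k_th_
// via a shuffle of the k-th queue element, so the filter keeps tightening as
// better values arrive.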
@@ -374,8 +383,9 @@ class warp_sort_distributed : public warp_sort { using warp_sort::kDummy; using warp_sort::kWarpWidth; using warp_sort::k; + using warp_sort::mem_required; - __device__ warp_sort_distributed(int k, T limit) + explicit _RAFT_DEVICE warp_sort_distributed(int k, T limit = kDummy) : warp_sort(k), buf_val_(kDummy), buf_idx_(IdxT{}), @@ -384,12 +394,14 @@ class warp_sort_distributed : public warp_sort { { } - __device__ __forceinline__ explicit warp_sort_distributed(int k) - : warp_sort_distributed(k, kDummy) + _RAFT_DEVICE _RAFT_FORCEINLINE static auto init_blockwide(int k, + uint8_t* = nullptr, + T limit = kDummy) { + return warp_sort_distributed{k, limit}; } - __device__ void add(T val, IdxT idx) + _RAFT_DEVICE void add(T val, IdxT idx) { // mask tells which lanes in the warp have valid items to be added uint32_t mask = ballot(is_ordered(val, k_th_)); @@ -429,7 +441,7 @@ class warp_sort_distributed : public warp_sort { } } - __device__ void done() + _RAFT_DEVICE void done() { if (buf_len_ != 0) { merge_buf_(); @@ -438,16 +450,16 @@ class warp_sort_distributed : public warp_sort { } private: - __device__ __forceinline__ void set_k_th_() + _RAFT_DEVICE _RAFT_FORCEINLINE void set_k_th_() { // NB on using srcLane: it's ok if it is outside the warp size / width; // the modulo op will be done inside the __shfl_sync. k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth); } - __device__ __forceinline__ void merge_buf_() + _RAFT_DEVICE _RAFT_FORCEINLINE void merge_buf_() { - topk::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val_, buf_idx_); + util::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val_, buf_idx_); this->merge_in<1>(&buf_val_, &buf_idx_); set_k_th_(); // contains warp sync buf_val_ = kDummy; @@ -464,6 +476,117 @@ class warp_sort_distributed : public warp_sort { T k_th_; }; +/** + * The same as `warp_sort_distributed`, but keeps the temporary value and index buffers + * in the given external pointers (normally, a shared memory pointer should be passed in). 
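// Worked example of the ballot/popcount compaction used in add() below
// (lane numbers illustrative): suppose lanes 2, 5 and 9 of a warp pass the
// k_th_ filter, so ballot(...) returns mask = 0b0000'0010'0010'0100.
//   lane 2: __popc(mask & 0b0000'0000'0000'0011) == 0 -> slot buf_len_ + 0
//   lane 5: __popc(mask & 0b0000'0000'0001'1111) == 1 -> slot buf_len_ + 1
//   lane 9: __popc(mask & 0b0000'0001'1111'1111) == 2 -> slot buf_len_ + 2
// Each passing lane thus gets a unique, densely packed buffer slot without
// atomics; buf_len_ then advances by __popc(mask) == 3 on all lanes.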
+ */ +template +class warp_sort_distributed_ext : public warp_sort { + public: + using warp_sort::kDummy; + using warp_sort::kWarpWidth; + using warp_sort::k; + + constexpr static auto mem_required(uint32_t block_size) -> size_t + { + return (sizeof(T) + sizeof(IdxT)) * block_size; + } + + _RAFT_DEVICE warp_sort_distributed_ext(int k, T* val_buf, IdxT* idx_buf, T limit = kDummy) + : warp_sort(k), + val_buf_(val_buf), + idx_buf_(idx_buf), + buf_len_(0), + k_th_(limit) + { + val_buf_[laneId()] = kDummy; + } + + _RAFT_DEVICE static auto init_blockwide(int k, uint8_t* shmem, T limit = kDummy) + { + T* val_buf = nullptr; + IdxT* idx_buf = nullptr; + if constexpr (alignof(T) >= alignof(IdxT)) { + val_buf = reinterpret_cast(shmem); + idx_buf = reinterpret_cast(val_buf + blockDim.x); + } else { + idx_buf = reinterpret_cast(shmem); + val_buf = reinterpret_cast(idx_buf + blockDim.x); + } + auto warp_offset = Pow2::roundDown(threadIdx.x); + val_buf += warp_offset; + idx_buf += warp_offset; + return warp_sort_distributed_ext{k, val_buf, idx_buf, limit}; + } + + _RAFT_DEVICE void add(T val, IdxT idx) + { + bool do_add = is_ordered(val, k_th_); + // mask tells which lanes in the warp have valid items to be added + uint32_t mask = ballot(do_add); + if (mask == 0) { return; } + // where to put the element in the tmp buffer + int dst_ix = buf_len_ + __popc(mask & ((1u << laneId()) - 1u)); + // put all elements, which fit into the current tmp buffer + if (do_add && dst_ix < WarpSize) { + val_buf_[dst_ix] = val; + idx_buf_[dst_ix] = idx; + do_add = false; + } + // Total number of elements to be added + buf_len_ += __popc(mask); + // If the buffer is still not full, we can return + if (buf_len_ < WarpSize) { return; } + // Otherwise, merge the warp tmp buffer into the queue + merge_buf_(); // implies warp sync + buf_len_ -= WarpSize; + // save the inputs that couldn't fit before the merge + if (do_add) { + dst_ix -= WarpSize; + val_buf_[dst_ix] = val; + idx_buf_[dst_ix] = idx; + } + } + + _RAFT_DEVICE void done() + { + if (buf_len_ != 0) { + merge_buf_(); + buf_len_ = 0; + } + __syncthreads(); + } + + private: + _RAFT_DEVICE _RAFT_FORCEINLINE void set_k_th_() + { + // NB on using srcLane: it's ok if it is outside the warp size / width; + // the modulo op will be done inside the __shfl_sync. + k_th_ = shfl(val_arr_[kMaxArrLen - 1], k - 1, kWarpWidth); + } + + _RAFT_DEVICE _RAFT_FORCEINLINE void merge_buf_() + { + __syncwarp(); // make sure the threads are aware of the data written by others + T buf_val = val_buf_[laneId()]; + IdxT buf_idx = idx_buf_[laneId()]; + val_buf_[laneId()] = kDummy; + util::bitonic<1>(!Ascending, kWarpWidth).sort(buf_val, buf_idx); + this->merge_in<1>(&buf_val, &buf_idx); + set_k_th_(); // contains warp sync + } + + using warp_sort::kMaxArrLen; + using warp_sort::val_arr_; + using warp_sort::idx_arr_; + + T* val_buf_; + IdxT* idx_buf_; + uint32_t buf_len_; // 0 <= buf_len_ < WarpSize + + T k_th_; +}; + /** * This version of warp_sort adds every input element into the intermediate sorting * buffer, and thus does the sorting step every `Capacity` input elements. 
@@ -476,8 +599,10 @@ class warp_sort_immediate : public warp_sort { using warp_sort::kDummy; using warp_sort::kWarpWidth; using warp_sort::k; + using warp_sort::mem_required; - __device__ warp_sort_immediate(int k) : warp_sort(k), buf_len_(0) + explicit _RAFT_DEVICE warp_sort_immediate(int k) + : warp_sort(k), buf_len_(0) { #pragma unroll for (int i = 0; i < kMaxArrLen; i++) { @@ -486,7 +611,12 @@ class warp_sort_immediate : public warp_sort { } } - __device__ void add(T val, IdxT idx) + _RAFT_DEVICE _RAFT_FORCEINLINE static auto init_blockwide(int k, uint8_t* = nullptr) + { + return warp_sort_immediate{k}; + } + + _RAFT_DEVICE void add(T val, IdxT idx) { // NB: the loop is used here to ensure the constant indexing, // to not force the buffers spill into the local memory. @@ -500,7 +630,7 @@ class warp_sort_immediate : public warp_sort { ++buf_len_; if (buf_len_ == kMaxArrLen) { - topk::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_); + util::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_); this->merge_in(val_buf_, idx_buf_); #pragma unroll for (int i = 0; i < kMaxArrLen; i++) { @@ -510,10 +640,10 @@ class warp_sort_immediate : public warp_sort { } } - __device__ void done() + _RAFT_DEVICE void done() { if (buf_len_ != 0) { - topk::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_); + util::bitonic(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_); this->merge_in(val_buf_, idx_buf_); } } @@ -545,15 +675,11 @@ class block_sort { using queue_t = WarpSortWarpWide; template - __device__ block_sort(int k, uint8_t* smem_buf, Args... args) : queue_(k, args...) + _RAFT_DEVICE block_sort(int k, Args... args) : queue_(queue_t::init_blockwide(k, args...)) { - val_smem_ = reinterpret_cast(smem_buf); - const int num_of_warp = subwarp_align::div(blockDim.x); - idx_smem_ = reinterpret_cast( - smem_buf + Pow2<256>::roundUp(ceildiv(num_of_warp, 2) * sizeof(T) * k)); } - __device__ void add(T val, IdxT idx) { queue_.add(val, idx); } + _RAFT_DEVICE void add(T val, IdxT idx) { queue_.add(val, idx); } /** * At the point of calling this function, the warp-level queues consumed all input @@ -561,22 +687,26 @@ class block_sort { * * Here we tree-merge the results using the shared memory and block sync. */ - __device__ void done() + _RAFT_DEVICE void done(uint8_t* smem_buf) { queue_.done(); + int nwarps = subwarp_align::div(blockDim.x); + auto val_smem = reinterpret_cast(smem_buf); + auto idx_smem = reinterpret_cast( + smem_buf + Pow2<256>::roundUp(ceildiv(nwarps, 2) * sizeof(T) * queue_.k)); + const int warp_id = subwarp_align::div(threadIdx.x); // NB: there is no need for the second __synchthreads between .load_sorted and .store: // we shift the pointers every iteration, such that individual warps either access the same // locations or do not overlap with any of the other warps. The access patterns within warps // are different for the two functions, but .load_sorted implies warp sync at the end, so // there is no need for __syncwarp either. 
- for (int shift_mask = ~0, nwarps = subwarp_align::div(blockDim.x), split = (nwarps + 1) >> 1; - nwarps > 1; + for (int shift_mask = ~0, split = (nwarps + 1) >> 1; nwarps > 1; nwarps = split, split = (nwarps + 1) >> 1) { if (warp_id < nwarps && warp_id >= split) { int dst_warp_shift = (warp_id - (split & shift_mask)) * queue_.k; - queue_.store(val_smem_ + dst_warp_shift, idx_smem_ + dst_warp_shift); + queue_.store(val_smem + dst_warp_shift, idx_smem + dst_warp_shift); } __syncthreads(); @@ -586,23 +716,27 @@ class block_sort { // The last argument serves as a condition for loading // -- to make sure threads within a full warp do not diverge on `bitonic::merge()` queue_.load_sorted( - val_smem_ + src_warp_shift, idx_smem_ + src_warp_shift, warp_id < nwarps - split); + val_smem + src_warp_shift, idx_smem + src_warp_shift, warp_id < nwarps - split); } } } /** Save the content by the pointer location. */ - template - __device__ void store(T* out, IdxT* out_idx, Lambda post_process = raft::identity_op()) const + template + _RAFT_DEVICE void store(OutT* out, + OutIdxT* out_idx, + ValF valF = raft::identity_op{}, + IdxF idxF = raft::identity_op{}) const { - if (threadIdx.x < subwarp_align::Value) { queue_.store(out, out_idx, post_process); } + if (threadIdx.x < subwarp_align::Value) { queue_.store(out, out_idx, valF, idxF); } } private: using subwarp_align = Pow2; queue_t queue_; - T* val_smem_; - IdxT* idx_smem_; }; /** @@ -620,7 +754,10 @@ __launch_bounds__(256) __global__ void block_kernel(const T* in, const IdxT* in_idx, IdxT len, int k, T* out, IdxT* out_idx) { extern __shared__ __align__(256) uint8_t smem_buf_bytes[]; - block_sort queue(k, smem_buf_bytes); + using bq_t = block_sort; + uint8_t* warp_smem = bq_t::queue_t::mem_required(blockDim.x) > 0 ? smem_buf_bytes : nullptr; + bq_t queue(k, warp_smem); + in += blockIdx.y * len; if (in_idx != nullptr) { in_idx += blockIdx.y * len; } @@ -631,7 +768,7 @@ __launch_bounds__(256) __global__ (i < len && in_idx != nullptr) ? __ldcs(in_idx + i) : i); } - queue.done(); + queue.done(smem_buf_bytes); const int block_id = blockIdx.x + gridDim.x * blockIdx.y; queue.store(out + block_id * k, out_idx + block_id * k); } @@ -658,7 +795,7 @@ struct launch_setup { int* min_grid_size, int block_size_limit = 0) { - const int capacity = calc_capacity(k); + const int capacity = bound_by_power_of_two(k); if constexpr (Capacity > 1) { if (capacity < Capacity) { return launch_setup::calc_optimal_params( @@ -691,7 +828,7 @@ struct launch_setup { IdxT* out_idx, rmm::cuda_stream_view stream) { - const int capacity = calc_capacity(k); + const int capacity = bound_by_power_of_two(k); if constexpr (Capacity > 1) { if (capacity < Capacity) { return launch_setup::kernel(k, @@ -742,6 +879,18 @@ struct LaunchThreshold { static constexpr int len_factor_for_single_block = 32; }; +template <> +struct LaunchThreshold { + static constexpr int len_factor_for_multi_block = 2; + static constexpr int len_factor_for_single_block = 32; +}; + +template <> +struct LaunchThreshold { + static constexpr int len_factor_for_multi_block = 2; + static constexpr int len_factor_for_single_block = 32; +}; + template <> struct LaunchThreshold { static constexpr int len_factor_for_choosing = 4; @@ -753,7 +902,7 @@ template
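// A hedged end-to-end sketch of the reworked block-wide API, reconstructing the
// usage pattern of block_kernel above (the template parameter lists here are
// assumptions, as they are not fully visible in the patch as rendered):
//
// template <int Capacity, bool Ascending, typename T, typename IdxT>
// __global__ void example_select_kernel(const T* in, IdxT len, int k, T* out, IdxT* out_idx)
// {
//   extern __shared__ __align__(256) uint8_t smem[];
//   using bq_t = block_sort<warp_sort_distributed_ext, Capacity, Ascending, T, IdxT>;
//   // per-warp scratch is only handed to queue types declaring mem_required(...) > 0
//   uint8_t* warp_smem = bq_t::queue_t::mem_required(blockDim.x) > 0 ? smem : nullptr;
//   bq_t queue(k, warp_smem);
//   for (IdxT i = threadIdx.x; i < len; i += blockDim.x) {
//     queue.add(in[i], i);   // feed all candidates; each warp keeps its best k
//   }
//   queue.done(smem);        // tree-merge the per-warp queues; smem is reused here
//   queue.store(out, out_idx);  // only the first sub-warp writes the final k results
// }
//
// The kernel would be launched with dynamic shared memory sized for both the
// per-warp scratch and the tree-merge buffers used by done().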