diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 9f0a6096d9..5214047571 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -19,6 +19,7 @@ set(RAFT_CPP_BENCH_TARGET "bench_raft") # (please keep the filenames in alphabetical order) add_executable(${RAFT_CPP_BENCH_TARGET} bench/linalg/reduce.cu + bench/spatial/selection.cu bench/main.cpp ) diff --git a/cpp/bench/spatial/selection.cu b/cpp/bench/spatial/selection.cu new file mode 100644 index 0000000000..09d02940a5 --- /dev/null +++ b/cpp/bench/spatial/selection.cu @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include + +namespace raft::bench::spatial { + +struct params { + int n_inputs; + int input_len; + int k; + int select_min; +}; + +template +struct selection : public fixture { + explicit selection(const params& p) + : params_(p), + in_dists_(p.n_inputs * p.input_len, stream), + in_ids_(p.n_inputs * p.input_len, stream), + out_dists_(p.n_inputs * p.k, stream), + out_ids_(p.n_inputs * p.k, stream) + { + raft::sparse::iota_fill(in_ids_.data(), IdxT(p.n_inputs), IdxT(p.input_len), stream); + raft::random::Rng(42).uniform( + in_dists_.data(), in_dists_.size(), KeyT(-1.0), KeyT(1.0), stream); + } + + void run_benchmark(::benchmark::State& state) override + { + using_pool_memory_res res; + try { + std::ostringstream label_stream; + label_stream << params_.n_inputs << "#" << params_.input_len << "#" << params_.k; + state.SetLabel(label_stream.str()); + loop_on_state(state, [this]() { + raft::spatial::knn::select_k(in_dists_.data(), + in_ids_.data(), + params_.n_inputs, + params_.input_len, + out_dists_.data(), + out_ids_.data(), + params_.select_min, + params_.k, + stream, + Algo); + }); + } catch (raft::exception& e) { + state.SkipWithError(e.what()); + } + } + + private: + const params params_; + rmm::device_uvector in_dists_, out_dists_; + rmm::device_uvector in_ids_, out_ids_; +}; + +const std::vector kInputs{ + {20000, 500, 1, true}, {20000, 500, 2, true}, {20000, 500, 4, true}, + {20000, 500, 8, true}, {20000, 500, 16, true}, {20000, 500, 32, true}, + {20000, 500, 64, true}, {20000, 500, 128, true}, {20000, 500, 256, true}, + + {1000, 10000, 1, true}, {1000, 10000, 2, true}, {1000, 10000, 4, true}, + {1000, 10000, 8, true}, {1000, 10000, 16, true}, {1000, 10000, 32, true}, + {1000, 10000, 64, true}, {1000, 10000, 128, true}, {1000, 10000, 256, true}, + + {100, 100000, 1, true}, {100, 100000, 2, true}, {100, 100000, 4, true}, + {100, 100000, 8, true}, {100, 100000, 16, true}, {100, 100000, 32, true}, + {100, 100000, 64, true}, {100, 100000, 128, true}, {100, 100000, 256, true}, + + {10, 1000000, 1, true}, {10, 1000000, 2, true}, {10, 1000000, 4, true}, + {10, 1000000, 8, true}, {10, 1000000, 16, true}, {10, 1000000, 32, true}, + {10, 1000000, 64, true}, {10, 1000000, 128, true}, {10, 1000000, 256, true}, +}; + +#define SELECTION_REGISTER(KeyT, IdxT, Algo) \ + 
namespace BENCHMARK_PRIVATE_NAME(selection) \ + { \ + using SelectK = selection; \ + RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #Algo, kInputs); \ + } + +SELECTION_REGISTER(float, int, FAISS); +SELECTION_REGISTER(float, int, RADIX_8_BITS); +SELECTION_REGISTER(float, int, RADIX_11_BITS); +SELECTION_REGISTER(float, int, WARP_SORT); + +SELECTION_REGISTER(double, int, FAISS); +SELECTION_REGISTER(double, int, RADIX_8_BITS); +SELECTION_REGISTER(double, int, RADIX_11_BITS); +SELECTION_REGISTER(double, int, WARP_SORT); + +SELECTION_REGISTER(double, size_t, FAISS); +SELECTION_REGISTER(double, size_t, RADIX_8_BITS); +SELECTION_REGISTER(double, size_t, RADIX_11_BITS); +SELECTION_REGISTER(double, size_t, WARP_SORT); + +} // namespace raft::bench::spatial diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h index 4ba1e18768..05fce6c0c4 100644 --- a/cpp/include/raft/cudart_utils.h +++ b/cpp/include/raft/cudart_utils.h @@ -404,6 +404,22 @@ IntType gcd(IntType a, IntType b) return a; } +template +constexpr T lower_bound() +{ + if constexpr (std::numeric_limits::has_infinity && std::numeric_limits::is_signed) { + return -std::numeric_limits::infinity(); + } + return std::numeric_limits::lowest(); +} + +template +constexpr T upper_bound() +{ + if constexpr (std::numeric_limits::has_infinity) { return std::numeric_limits::infinity(); } + return std::numeric_limits::max(); +} + } // namespace raft #endif diff --git a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh index 03a4eabaac..2d2fabd9d6 100644 --- a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh +++ b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,24 +31,30 @@ namespace spatial { namespace knn { namespace detail { -template -__global__ void select_k_kernel(K* inK, - IndexType* inV, +template +constexpr int kFaissMaxK() +{ + return (sizeof(key_t) + sizeof(payload_t) > 8) ? 
512 : 1024; +} + +template +__global__ void select_k_kernel(key_t* inK, + payload_t* inV, size_t n_rows, size_t n_cols, - K* outK, - IndexType* outV, - K initK, - IndexType initV, + key_t* outK, + payload_t* outV, + key_t initK, + payload_t initV, int k) { constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; - __shared__ K smemK[kNumWarps * warp_q]; - __shared__ IndexType smemV[kNumWarps * warp_q]; + __shared__ key_t smemK[kNumWarps * warp_q]; + __shared__ payload_t smemV[kNumWarps * warp_q]; faiss::gpu:: - BlockSelect, warp_q, thread_q, tpb> + BlockSelect, warp_q, thread_q, tpb> heap(initK, initV, smemK, smemV, k); // Grid is exactly sized to rows available @@ -56,8 +62,8 @@ __global__ void select_k_kernel(K* inK, int i = threadIdx.x; int idx = row * n_cols; - K* inKStart = inK + idx + i; - IndexType* inVStart = inV + idx + i; + key_t* inKStart = inK + idx + i; + payload_t* inVStart = inV + idx + i; // Whole warps must participate in the selection int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize); @@ -84,13 +90,13 @@ __global__ void select_k_kernel(K* inK, } } -template -inline void select_k_impl(value_t* inK, - value_idx* inV, +template +inline void select_k_impl(key_t* inK, + payload_t* inV, size_t n_rows, size_t n_cols, - value_t* outK, - value_idx* outV, + key_t* outK, + payload_t* outV, bool select_min, int k, cudaStream_t stream) @@ -100,14 +106,13 @@ inline void select_k_impl(value_t* inK, constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; auto block = dim3(n_threads); - auto kInit = - select_min ? faiss::gpu::Limits::getMax() : faiss::gpu::Limits::getMin(); + auto kInit = select_min ? upper_bound() : lower_bound(); auto vInit = -1; if (select_min) { - select_k_kernel + select_k_kernel <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k); } else { - select_k_kernel + select_k_kernel <<>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k); } RAFT_CUDA_TRY(cudaGetLastError()); @@ -127,38 +132,41 @@ inline void select_k_impl(value_t* inK, * @param[in] k number of neighbors per partition (also number of merged neighbors) * @param[in] stream CUDA stream to use */ -template -inline void select_k(value_t* inK, - value_idx* inV, +template +inline void select_k(key_t* inK, + payload_t* inV, size_t n_rows, size_t n_cols, - value_t* outK, - value_idx* outV, + key_t* outK, + payload_t* outV, bool select_min, int k, cudaStream_t stream) { + constexpr int max_k = kFaissMaxK(); if (k == 1) - select_k_impl( + select_k_impl( inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 32) - select_k_impl( + select_k_impl( inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 64) - select_k_impl( + select_k_impl( inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 128) - select_k_impl( + select_k_impl( inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 256) - select_k_impl( + select_k_impl( inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); else if (k <= 512) - select_k_impl( + select_k_impl( inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); - else if (k <= 1024) - select_k_impl( + else if (k <= 1024 && k <= max_k) + select_k_impl( inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream); + else + ASSERT(k <= max_k, "Current max k is %d (requested %d)", max_k, k); } }; // namespace detail diff --git a/cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh b/cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh new file mode 100644 index 
0000000000..44ffe6bc50 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/topk/bitonic_sort.cuh @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::spatial::knn::detail::topk { + +namespace helpers { + +template +__device__ __forceinline__ void swap(T& x, T& y) +{ + T t = x; + x = y; + y = t; +} + +template +__device__ __forceinline__ void conditional_assign(bool cond, T& ptr, T x) +{ + if (cond) { ptr = x; } +} + +} // namespace helpers + +/** + * Warp-wide bitonic merge and sort. + * The data is strided among `warp_width` threads, + * e.g. calling `bitonic<4>(ascending=true).sort(arr)` takes a unique 4-element array as input of + * each thread in a warp and sorts them, such that for a fixed i, arr[i] are sorted within the + * threads in a warp, and for any i < j, arr[j] in any thread is not smaller than arr[i] in any + * other thread. + * When `warp_width < WarpSize`, the data is sorted within all subwarps of the warp independently. + * + * As an example, assuming `Size = 4`, `warp_width = 16`, and `WarpSize = 32`, sorting a permutation + * of numbers 0-63 in each subwarp yield the following result: + * ` + * arr_i \ laneId() + * 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ... + * subwarp_1 subwarp_2 + * 0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0 1 2 ... + * 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 16 17 18 ... + * 2 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 32 33 34 ... + * 3 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 48 49 50 ... + * ` + * + * @tparam Size + * number of elements processed in each thread; + * i.e. the total data size is `Size * warp_width`. + * Must be power-of-two. + * + */ +template +class bitonic { + static_assert(isPo2(Size)); + + public: + /** + * Initialize bitonic sort config. + * + * @param ascending + * the resulting order (true: ascending, false: descending). + * @param warp_width + * the number of threads participating in the warp-level primitives; + * the total size of the sorted data is `Size * warp_width`. + * Must be power-of-two, not larger than the WarpSize. + */ + __device__ __forceinline__ explicit bitonic(bool ascending, int warp_width = WarpSize) + : ascending_(ascending), warp_width_(warp_width) + { + } + + bitonic(bitonic const&) = delete; + bitonic(bitonic&&) = delete; + auto operator=(bitonic const&) -> bitonic& = delete; + auto operator=(bitonic&&) -> bitonic& = delete; + + /** + * You can think of this function in two ways: + * + * 1) Sort any bitonic sequence. + * 2) Merge two halfs of the input data assuming they're already sorted, and their order is + * opposite (i.e. either ascending, descending or vice-versa). + * + * The input pointers are unique per-thread. + * See the class description for the description of the data layout. + * + * @param keys + * is a device pointer to a contiguous array of keys, unique per thread; must be at least `Size` + * elements long. 
+ * @param payloads + * are zero or more associated arrays of the same size as keys, which are sorted together with + * the keys; must be at least `Size` elements long. + */ + template + __device__ __forceinline__ void merge(KeyT* __restrict__ keys, + PayloadTs* __restrict__... payloads) const + { + return bitonic::merge_(ascending_, warp_width_, keys, payloads...); + } + + /** + * Sort the data. + * The input pointers are unique per-thread. + * See the class description for the description of the data layout. + * + * @param keys + * is a device pointer to a contiguous array of keys, unique per thread; must be at least `Size` + * elements long. + * @param payloads + * are zero or more associated arrays of the same size as keys, which are sorted together with + * the keys; must be at least `Size` elements long. + */ + template + __device__ __forceinline__ void sort(KeyT* __restrict__ keys, + PayloadTs* __restrict__... payloads) const + { + return bitonic::sort_(ascending_, warp_width_, keys, payloads...); + } + + /** + * @brief `merge` variant for the case of one element per thread + * (pass input by a reference instead of a pointer). + * + * @param key + * @param payload + */ + template + __device__ __forceinline__ auto merge(KeyT& __restrict__ key, + PayloadTs& __restrict__... payload) const + -> std::enable_if_t // SFINAE to enable this for Size == 1 only + { + static_assert(S == Size); + return merge(&key, &payload...); + } + + /** + * @brief `sort` variant for the case of one element per thread + * (pass input by a reference instead of a pointer). + * + * @param key + * @param payload + */ + template + __device__ __forceinline__ auto sort(KeyT& __restrict__ key, + PayloadTs& __restrict__... payload) const + -> std::enable_if_t // SFINAE to enable this for Size == 1 only + { + static_assert(S == Size); + return sort(&key, &payload...); + } + + private: + const int warp_width_; + const bool ascending_; + + template + friend class bitonic; + + template + static __device__ __forceinline__ void merge_(bool ascending, + int warp_width, + KeyT* __restrict__ keys, + PayloadTs* __restrict__... payloads) + { +#pragma unroll + for (int size = Size; size > 1; size >>= 1) { + const int stride = size >> 1; +#pragma unroll + for (int offset = 0; offset < Size; offset += size) { +#pragma unroll + for (int i = offset + stride - 1; i >= offset; i--) { + const int other_i = i + stride; + KeyT& key = keys[i]; + KeyT& other = keys[other_i]; + if (ascending ? key > other : key < other) { + helpers::swap(key, other); + (helpers::swap(payloads[i], payloads[other_i]), ...); + } + } + } + } + const int lane = laneId(); +#pragma unroll + for (int i = 0; i < Size; i++) { + KeyT& key = keys[i]; + for (int stride = (warp_width >> 1); stride > 0; stride >>= 1) { + const bool is_second = lane & stride; + const KeyT other = shfl_xor(key, stride, warp_width); + const bool do_assign = (ascending != is_second) ? key > other : key < other; + + helpers::conditional_assign(do_assign, key, other); + // NB: don't put shfl_xor in a conditional; it must be called by all threads in a warp. + (helpers::conditional_assign( + do_assign, payloads[i], shfl_xor(payloads[i], stride, warp_width)), + ...); + } + } + } + + template + static __device__ __forceinline__ void sort_(bool ascending, + int warp_width, + KeyT* __restrict__ keys, + PayloadTs* __restrict__... 
payloads) + { + if constexpr (Size == 1) { + const int lane = laneId(); + for (int width = 2; width < warp_width; width <<= 1) { + bitonic<1>::merge_(lane & width, width, keys, payloads...); + } + } else { + constexpr int kSize2 = Size / 2; + bitonic::sort_(false, warp_width, keys, payloads...); + bitonic::sort_(true, warp_width, keys + kSize2, (payloads + kSize2)...); + } + bitonic::merge_(ascending, warp_width, keys, payloads...); + } +}; + +} // namespace raft::spatial::knn::detail::topk diff --git a/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh new file mode 100644 index 0000000000..21e6ea026c --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/topk/radix_topk.cuh @@ -0,0 +1,608 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace raft::spatial::knn::detail::topk { + +constexpr int ITEM_PER_THREAD = 32; +constexpr int VECTORIZED_READ_SIZE = 16; + +template +__host__ __device__ constexpr int calc_num_buckets() +{ + return 1 << BitsPerPass; +} + +template +__host__ __device__ constexpr int calc_num_passes() +{ + return ceildiv(sizeof(T) * 8, BitsPerPass); +} + +/** + * Bit 0 is the least significant (rightmost); + * this implementation processes input from the most to the least significant bit. + * This way, we can skip some passes in the end at the cost of having an unsorted output. + * + * NB: Use pass=-1 for calc_mask(). + */ +template +__device__ constexpr int calc_start_bit(int pass) +{ + int start_bit = static_cast(sizeof(T) * 8) - (pass + 1) * BitsPerPass; + if (start_bit < 0) { start_bit = 0; } + return start_bit; +} + +template +__device__ constexpr unsigned calc_mask(int pass) +{ + static_assert(BitsPerPass <= 31); + int num_bits = calc_start_bit(pass - 1) - calc_start_bit(pass); + return (1 << num_bits) - 1; +} + +/** + * Use cub to twiddle bits - so that we can correctly compare bits of floating-point values as well + * as of integers. + */ +template +__device__ typename cub::Traits::UnsignedBits twiddle_in(T key, bool greater) +{ + auto bits = reinterpret_cast::UnsignedBits&>(key); + bits = cub::Traits::TwiddleIn(bits); + if (greater) { bits = ~bits; } + return bits; +} + +template +__device__ int calc_bucket(T x, int start_bit, unsigned mask, bool greater) +{ + static_assert(BitsPerPass <= sizeof(int) * 8 - 1); // so return type can be int + return (twiddle_in(x, greater) >> start_bit) & mask; +} + +/** + * Map a Func over the input data, using vectorized load instructions if possible. 
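+ *
+ * A minimal usage sketch (hypothetical device-side caller; `row`, `hist` and `bin()` are assumed
+ * to exist only for illustration):
+ *   `vectorized_process(row, row_len, [hist](float x, int i) { atomicAdd(hist + bin(x), 1); });`
+ * Every input element is visited exactly once by some thread of the grid, although in no
+ * particular order.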
+ * + * NB: in future, we should move this to cpp/include/raft/linalg/detail/unary_op.cuh, which + * currently does not support the second lambda argument (index of an element) + * + * @tparam T element type + * @tparam IdxT indexing type + * @tparam Func void (T x, IdxT idx) + * + * @param in the input data + * @param len the number of elements to read + * @param f the lambda taking two arguments (T x, IdxT idx) + */ +template +__device__ void vectorized_process(const T* in, IdxT len, Func f) +{ + const IdxT stride = blockDim.x * gridDim.x; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if constexpr (sizeof(T) >= VECTORIZED_READ_SIZE || VECTORIZED_READ_SIZE % sizeof(T) != 0) { + for (IdxT i = tid; i < len; i += stride) { + f(in[i], i); + } + } else { + using wide_t = TxN_t; + using align_bytes = Pow2<(size_t)VECTORIZED_READ_SIZE>; + using align_elems = Pow2; + wide_t wide; + + // how many elements to skip in order to do aligned vectorized load + const IdxT skip_cnt_left = std::min((IdxT)(align_bytes::roundUp(in) - in), len); + + // The main loop: process all aligned data + for (IdxT i = tid * wide_t::Ratio + skip_cnt_left; i + wide_t::Ratio <= len; + i += stride * wide_t::Ratio) { + wide.load(in, i); +#pragma unroll + for (int j = 0; j < wide_t::Ratio; ++j) { + f(wide.val.data[j], i + j); + } + } + + static_assert(WarpSize >= wide_t::Ratio); + // Processes the skipped elements on the left + if (tid < skip_cnt_left) { f(in[tid], tid); } + // Processes the skipped elements on the right + const IdxT skip_cnt_right = align_elems::mod(len - skip_cnt_left); + const IdxT remain_i = len - skip_cnt_right + tid; + if (remain_i < len) { f(in[remain_i], remain_i); } + } +} + +template +struct Counter { + IdxT k; + IdxT len; + IdxT previous_len; + int bucket; + + IdxT filter_cnt; + unsigned int finished_block_cnt; + IdxT out_cnt; + IdxT out_back_cnt; +}; + +/** + * Fused filtering of the current phase and building histogram for the next phase + * (see steps 4-1 in `radix_kernel` description). + */ +template +__device__ void filter_and_histogram(const T* in_buf, + const IdxT* in_idx_buf, + T* out_buf, + IdxT* out_idx_buf, + T* out, + IdxT* out_idx, + IdxT len, + Counter* counter, + IdxT* histogram, + bool greater, + int pass, + int k) +{ + constexpr int num_buckets = calc_num_buckets(); + __shared__ IdxT histogram_smem[num_buckets]; + for (IdxT i = threadIdx.x; i < num_buckets; i += blockDim.x) { + histogram_smem[i] = 0; + } + __syncthreads(); + + const int start_bit = calc_start_bit(pass); + const unsigned mask = calc_mask(pass); + + if (pass == 0) { + // Passed to vectorized_process, this function executes in all blocks in parallel, + // i.e. the work is split along the input (both, in batches and chunks of a single row). + // Later, the histograms are merged using atomicAdd. + auto f = [greater, start_bit, mask](T value, IdxT) { + int bucket = calc_bucket(value, start_bit, mask, greater); + atomicAdd(histogram_smem + bucket, IdxT(1)); + }; + vectorized_process(in_buf, len, f); + } else { + const IdxT previous_len = counter->previous_len; + const int want_bucket = counter->bucket; + IdxT& filter_cnt = counter->filter_cnt; + IdxT& out_cnt = counter->out_cnt; + const IdxT counter_len = counter->len; + const int previous_start_bit = calc_start_bit(pass - 1); + const unsigned previous_mask = calc_mask(pass - 1); + + // See the remark above on the distributed execution of `f` using vectorized_process. 
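+ // For every input element, `f` inspects the digit selected in the *previous* pass:
+ //   - elements from buckets smaller than `want_bucket` are already known to belong to the
+ //     top-k and are written directly to the output (`out`/`out_idx`);
+ //   - elements from `want_bucket` itself are compacted into `out_buf`/`out_idx_buf` for the
+ //     next pass and contribute to the histogram of the current digit;
+ //   - all remaining elements are discarded.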
+ auto f = [in_idx_buf, + out_buf, + out_idx_buf, + out, + out_idx, + greater, + k, + start_bit, + mask, + previous_start_bit, + previous_mask, + want_bucket, + &filter_cnt, + &out_cnt, + counter_len](T value, IdxT i) { + int prev_bucket = + calc_bucket(value, previous_start_bit, previous_mask, greater); + if (prev_bucket == want_bucket) { + IdxT pos = atomicAdd(&filter_cnt, IdxT(1)); + out_buf[pos] = value; + if (out_idx_buf) { out_idx_buf[pos] = in_idx_buf ? in_idx_buf[i] : i; } + int bucket = calc_bucket(value, start_bit, mask, greater); + atomicAdd(histogram_smem + bucket, IdxT(1)); + + if (counter_len == 1) { + out[k - 1] = value; + out_idx[k - 1] = in_idx_buf ? in_idx_buf[i] : i; + } + } else if (prev_bucket < want_bucket) { + IdxT pos = atomicAdd(&out_cnt, IdxT(1)); + out[pos] = value; + out_idx[pos] = in_idx_buf ? in_idx_buf[i] : i; + } + }; + + vectorized_process(in_buf, previous_len, f); + } + __syncthreads(); + + // merge histograms produced by individual blocks + for (int i = threadIdx.x; i < num_buckets; i += blockDim.x) { + if (histogram_smem[i] != 0) { atomicAdd(histogram + i, histogram_smem[i]); } + } +} + +/** + * Replace a part of the histogram with its own prefix sum, starting from the `start` and adding + * `current` to each entry of the result. + * (step 2 in `radix_kernel` description) + */ +template +__device__ void scan(volatile IdxT* histogram, + const int start, + const int num_buckets, + const IdxT current) +{ + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + IdxT thread_data = 0; + int index = start + threadIdx.x; + if (index < num_buckets) { thread_data = histogram[index]; } + + BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + __syncthreads(); + if (index < num_buckets) { histogram[index] = thread_data + current; } + __syncthreads(); // This sync is necessary, as the content of histogram needs + // to be read after +} + +/** + * Calculate in which bucket the k-th value will fall + * (steps 2-3 in `radix_kernel` description) + */ +template +__device__ void choose_bucket(Counter* counter, IdxT* histogram, const IdxT k) +{ + constexpr int num_buckets = calc_num_buckets(); + int index = threadIdx.x; + IdxT last_prefix_sum = 0; + int num_pass = 1; + if constexpr (num_buckets >= BlockSize) { + static_assert(num_buckets % BlockSize == 0); + num_pass = num_buckets / BlockSize; + } + + for (int i = 0; i < num_pass && (last_prefix_sum < k); i++) { + // Turn the i-th chunk of the histogram into its prefix sum. + scan(histogram, i * BlockSize, num_buckets, last_prefix_sum); + if (index < num_buckets) { + // Number of values in the previous `index-1` buckets (see the `scan` op above) + IdxT prev = (index == 0) ? 0 : histogram[index - 1]; + // Number of values in `index` buckets + IdxT cur = histogram[index]; + + // one and only one thread will satisfy this condition, so only write once + if (prev < k && cur >= k) { + counter->k = k - prev; // how many values still are there to find + counter->previous_len = counter->len; + counter->len = cur - prev; // number of values in `index` bucket + counter->bucket = index; + } + } + index += BlockSize; + // this will break the loop when the counter is set (cur >= k), because last_prefix_sum >= cur + last_prefix_sum = histogram[(i + 1) * BlockSize - 1]; + } +} + +/** + * + * It is expected to call this kernel multiple times (passes), in each pass we process a radix, + * going from the most significant towards the least significant bits (MSD). 
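+ * For example, with a 32-bit key type and BitsPerPass = 11, calc_num_passes() = ceildiv(32, 11) = 3,
+ * and the three passes examine bits [21, 31], [10, 20] and [0, 9] respectively (calc_start_bit()
+ * clamps the last pass at bit 0, so it covers only the remaining 10 bits).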
+ * + * Conceptually, each pass consists of 4 steps: + * + * 1. Calculate histogram + * First, transform bits into a digit, the value of which is in the range + * [0, 2^{BITS_PER_PASS}-1]. Then count the frequency of each digit value and the result is a + * histogram. That is, histogram[i] contains the count of inputs having value i. + * + * 2. Scan the histogram + * Inclusive prefix sum is computed for the histogram. After this step, histogram[i] contains + * the count of inputs having value <= i. + * + * 3. Find the bucket j of the histogram that the k-th value falls into + * + * 4. Filtering + * Input elements whose digit value +__global__ void __launch_bounds__(BlockSize) radix_kernel(const T* in_buf, + const IdxT* in_idx_buf, + T* out_buf, + IdxT* out_idx_buf, + T* out, + IdxT* out_idx, + Counter* counters, + IdxT* histograms, + const IdxT len, + const int k, + const bool greater, + const int pass) +{ + __shared__ bool isLastBlockDone; + + constexpr int num_buckets = calc_num_buckets(); + constexpr int num_passes = calc_num_passes(); + const int batch_id = blockIdx.y; + in_buf += batch_id * len; + out_buf += batch_id * len; + out += batch_id * k; + out_idx += batch_id * k; + if (in_idx_buf) { in_idx_buf += batch_id * len; } + if (out_idx_buf) { out_idx_buf += batch_id * len; } + + auto counter = counters + batch_id; + auto histogram = histograms + batch_id * num_buckets; + + filter_and_histogram(in_buf, + in_idx_buf, + out_buf, + out_idx_buf, + out, + out_idx, + len, + counter, + histogram, + greater, + pass, + k); + __threadfence(); + + if (threadIdx.x == 0) { + unsigned int finished = atomicInc(&counter->finished_block_cnt, gridDim.x - 1); + isLastBlockDone = (finished == (gridDim.x - 1)); + } + + // Synchronize to make sure that each thread reads the correct value of + // isLastBlockDone. + __syncthreads(); + if (isLastBlockDone) { + if (counter->len == 1 && threadIdx.x == 0) { + counter->previous_len = 0; + counter->len = 0; + } + // init counter, other members of counter is initialized with 0 by + // cudaMemset() + if (pass == 0 && threadIdx.x == 0) { + counter->k = k; + counter->len = len; + counter->out_back_cnt = 0; + } + __syncthreads(); + + IdxT ori_k = counter->k; + + if (counter->len > 0) { + choose_bucket(counter, histogram, ori_k); + } + + __syncthreads(); + if (pass == num_passes - 1) { + const IdxT previous_len = counter->previous_len; + const int want_bucket = counter->bucket; + int start_bit = calc_start_bit(pass); + unsigned mask = calc_mask(pass); + + // radix topk + IdxT& out_cnt = counter->out_cnt; + for (IdxT i = threadIdx.x; i < previous_len; i += blockDim.x) { + const T value = out_buf[i]; + int bucket = calc_bucket(value, start_bit, mask, greater); + if (bucket < want_bucket) { + IdxT pos = atomicAdd(&out_cnt, IdxT(1)); + out[pos] = value; + out_idx[pos] = out_idx_buf[i]; + } else if (bucket == want_bucket) { + IdxT needed_num_of_kth = counter->k; + IdxT back_pos = atomicAdd(&(counter->out_back_cnt), IdxT(1)); + if (back_pos < needed_num_of_kth) { + IdxT pos = k - 1 - back_pos; + out[pos] = value; + out_idx[pos] = out_idx_buf[i]; + } + } + } + __syncthreads(); + } else { + // reset for next pass + for (int i = threadIdx.x; i < num_buckets; i += blockDim.x) { + histogram[i] = 0; + } + if (threadIdx.x == 0) { counter->filter_cnt = 0; } + } + } +} + +/** + * Calculate the minimal batch size, such that GPU is still fully occupied. 
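+ *
+ * A worked example with hypothetical numbers: for sm_count = 108, occupancy = 2 and
+ * blocks_per_row = 25, the initial estimate is ceildiv(108 * 2, 25) = 9, which is rounded up to
+ * the next power of two, 16. The estimate is then enlarged further while it is much smaller than
+ * the requested batch size (or while the overall grid would be small), and is finally capped by
+ * the maximum grid y-dimension and by the requested batch size itself.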
+ */ +template +inline uint16_t get_optimal_batch_size(size_t req_batch_size, size_t blocks_per_row) +{ + int dev_id, sm_count, occupancy, max_grid_dim_y; + RAFT_CUDA_TRY(cudaGetDevice(&dev_id)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&max_grid_dim_y, cudaDevAttrMaxGridDimY, dev_id)); + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &occupancy, radix_kernel, BlockSize, 0)); + + // fully occupy GPU + size_t opt_batch_size = ceildiv(sm_count * occupancy, blocks_per_row); + // round it up to the closest pow-of-two for better data alignment + opt_batch_size = isPo2(opt_batch_size) ? opt_batch_size : (1 << (log2(opt_batch_size) + 1)); + // Take a max possible pow-of-two grid_dim_y + max_grid_dim_y = isPo2(max_grid_dim_y) ? max_grid_dim_y : (1 << log2(max_grid_dim_y)); + // If the optimal batch size is very small compared to the requested batch size, we know + // the extra required memory is not significant and we can increase the batch size for + // better occupancy when the grid size is not multiple of the SM count. + // Also don't split the batch size when there is not much work overall. + const size_t safe_enlarge_factor = 9; + const size_t min_grid_size = 1024; + while ((opt_batch_size << safe_enlarge_factor) < req_batch_size || + blocks_per_row * opt_batch_size < min_grid_size) { + opt_batch_size <<= 1; + } + + // Do not exceed the max grid size. + opt_batch_size = std::min(opt_batch_size, size_t(max_grid_dim_y)); + + // Don't do more work than needed + return uint16_t(std::min(opt_batch_size, req_batch_size)); +} + +/** + * Select k smallest or largest key/values from each row in the input data. + * + * If you think of the input data `in_keys` as a row-major matrix with len columns and + * batch_size rows, then this function selects k smallest/largest values in each row and fills + * in the row-major matrix `out` of size (batch_size, k). + * + * Note, the output is NOT sorted within the groups of `k` selected elements. + * + * @tparam T + * the type of the keys (what is being compared). + * @tparam IdxT + * the index type (what is being selected together with the keys). + * @tparam BitsPerPass + * The size of the radix; + * it affects the number of passes and number of buckets. + * @tparam BlockSize + * Number of threads in a kernel thread block. + * + * @param[in] in + * contiguous device array of inputs of size (len * batch_size); + * these are compared and selected. + * @param[in] in_idx + * contiguous device array of inputs of size (len * batch_size); + * typically, these are indices of the corresponding in_keys. + * @param[in] batch_size + * number of input rows, i.e. the batch size. + * @param[in] len + * length of a single input array (row); also sometimes referred as n_cols. + * Invariant: len >= k. + * @param[in] k + * the number of outputs to select in each input row. + * @param[out] out + * contiguous device array of outputs of size (k * batch_size); + * the k smallest/largest values from each row of the `in_keys`. + * @param[out] out_idx + * contiguous device array of outputs of size (k * batch_size); + * the payload selected together with `out`. + * @param[in] select_min + * whether to select k smallest (true) or largest (false) keys. 
+ * @param[in] stream + */ +template +void radix_topk(const T* in, + const IdxT* in_idx, + size_t batch_size, + size_t len, + int k, + T* out, + IdxT* out_idx, + bool select_min, + rmm::cuda_stream_view stream) +{ + // TODO: is it possible to relax this restriction? + static_assert(calc_num_passes() > 1); + constexpr int num_buckets = calc_num_buckets(); + + size_t blocks_per_row = ceildiv(len, BlockSize * ITEM_PER_THREAD); + uint16_t max_chunk_size = + get_optimal_batch_size(batch_size, blocks_per_row); + + rmm::device_uvector> counters(max_chunk_size, stream); + rmm::device_uvector histograms(num_buckets * max_chunk_size, stream); + rmm::device_uvector buf1(len * max_chunk_size, stream); + rmm::device_uvector idx_buf1(len * max_chunk_size, stream); + rmm::device_uvector buf2(len * max_chunk_size, stream); + rmm::device_uvector idx_buf2(len * max_chunk_size, stream); + + for (size_t offset = 0; offset < batch_size; offset += max_chunk_size) { + auto chunk_size = uint16_t(std::min(max_chunk_size, batch_size - offset)); + + RAFT_CUDA_TRY( + cudaMemsetAsync(counters.data(), 0, counters.size() * sizeof(Counter), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(histograms.data(), 0, histograms.size() * sizeof(IdxT), stream)); + + const T* in_buf = nullptr; + const IdxT* in_idx_buf = nullptr; + T* out_buf = nullptr; + IdxT* out_idx_buf = nullptr; + + dim3 blocks(blocks_per_row, chunk_size); + + constexpr int num_passes = calc_num_passes(); + + for (int pass = 0; pass < num_passes; ++pass) { + if (pass == 0) { + in_buf = in + offset * len; + in_idx_buf = nullptr; + out_buf = nullptr; + out_idx_buf = nullptr; + } else if (pass == 1) { + in_buf = in + offset * len; + in_idx_buf = in_idx ? in_idx + offset * len : nullptr; + out_buf = buf1.data(); + out_idx_buf = idx_buf1.data(); + } else if (pass % 2 == 0) { + in_buf = buf1.data(); + in_idx_buf = idx_buf1.data(); + out_buf = buf2.data(); + out_idx_buf = idx_buf2.data(); + } else { + in_buf = buf2.data(); + in_idx_buf = idx_buf2.data(); + out_buf = buf1.data(); + out_idx_buf = idx_buf1.data(); + } + + radix_kernel + <<>>(in_buf, + in_idx_buf, + out_buf, + out_idx_buf, + out + offset * k, + out_idx + offset * k, + counters.data(), + histograms.data(), + len, + k, + !select_min, + pass); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + } +} + +} // namespace raft::spatial::knn::detail::topk diff --git a/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh new file mode 100644 index 0000000000..f5ea8ba879 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh @@ -0,0 +1,881 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "bitonic_sort.cuh" + +#include +#include + +#include +#include +#include + +/* + Three APIs of different scopes are provided: + 1. host function: warp_sort_topk() + 2. block-wide API: class block_sort + 3. 
warp-wide API: class warp_sort_filtered and class warp_sort_immediate + + + 1. warp_sort_topk() + (see the docstring) + + 2. class block_sort + It can be regarded as a fixed size priority queue for a thread block, + although the API is not typical. + class warp_sort_filtered and warp_sort_immediate can be used to instantiate block_sort. + + It uses dynamic shared memory as intermediate buffer. + So the required shared memory size should be calculated using + calc_smem_size_for_block_wide() and passed as the 3rd kernel launch parameter. + + Two overload functions can be used to add items to the queue. + One is load(const T* in, IdxT start, IdxT end) and it adds a range of items, + namely [start, end) of in. The idx is inferred from start. + This function should be called only once to add all items, and should not be + used together with the add(). + The second one is add(T val, IdxT idx), and it adds only one item pair. + Note that the range [start, end) is for the whole block of threads, that is, + each thread in the same block should get the same start/end. + In contrast, the parameters of the second form are for only one thread, + so each thread must get different val/idx. + + After adding is finished, function done() should be called. And finally, + store() is used to get the top-k result. + + Example: + __global__ void kernel() { + block_sort queue(...); + + // way 1, [0, len) is same for the whole block + queue.load(in, 0, len); + // way 2, each thread gets its own val/idx pair + for (IdxT i = threadIdx.x; i < len, i += blockDim.x) { + queue.add(in[i], idx[i]); + } + + queue.done(); + queue.store(out, out_idx); + } + + int smem_size = calc_smem_size_for_block_wide(...); + kernel<<>>(); + + + 3. class warp_sort_filtered and class warp_sort_immediate + These two classes can be regarded as fixed size priority queue for a warp. + Usage is similar to class block_sort. + Two types of add() functions are provided, and also note that [start, end) is + for a whole warp, while val/idx is for a thread. + No shared memory is needed. + + The host function (warp_sort_topk) uses a heuristic to choose between these two classes for + sorting, warp_sort_immediate being chosen when the number of inputs per warp is somewhat small + (see the usage of LaunchThreshold::len_factor_for_choosing). + + Example: + __global__ void kernel() { + warp_sort_immediate<...> queue(...); + int warp_id = threadIdx.x / WarpSize; + int lane_id = threadIdx.x % WarpSize; + + // way 1, [0, len) is same for the whole warp + queue.load(in, 0, len); + // way 2, each thread gets its own val/idx pair + for (IdxT i = lane_id; i < len, i += WarpSize) { + queue.add(in[i], idx[i]); + } + + queue.done(); + // each warp outputs to a different offset + queue.store(out+ warp_id * k, out_idx+ warp_id * k); + } + */ + +namespace raft::spatial::knn::detail::topk { + +static constexpr int kMaxCapacity = 256; + +namespace { + +/** Whether 'left` should indeed be on the left w.r.t. `right`. */ +template +__device__ __forceinline__ auto is_ordered(T left, T right) -> bool +{ + if constexpr (Ascending) { return left < right; } + if constexpr (!Ascending) { return left > right; } +} + +constexpr auto calc_capacity(int k) -> int +{ + int capacity = isPo2(k) ? k : (1 << (log2(k) + 1)); + if (capacity < WarpSize) { capacity = WarpSize; } // TODO: remove this to allow small sizes. + return capacity; +} + +} // namespace + +/** + * A fixed-size warp-level priority queue. 
+ * By feeding the data through this queue, you get the `k <= Capacity` + * smallest/greatest values in the data. + * + * @tparam Capacity + * maximum number of elements in the queue. + * @tparam Ascending + * which comparison to use: `true` means `<`, collect the smallest elements, + * `false` means `>`, collect the greatest elements. + * @tparam T + * the type of keys (what is being compared) + * @tparam IdxT + * the type of payload (normally, indices of elements), i.e. + * the content sorted alongside the keys. + */ +template +class warp_sort { + static_assert(isPo2(Capacity)); + + public: + /** + * Construct the warp_sort empty queue. + * + * @param k + * number of elements to select. + * @param dummy + * the `empty` value for the choosen binary operation, + * i.e. `Ascending ? upper_bound() : lower_bound()`. + * + */ + __device__ warp_sort(IdxT k, T dummy) : k_(k), dummy_(dummy) + { +#pragma unroll + for (int i = 0; i < kMaxArrLen; i++) { + val_arr_[i] = dummy_; + } + } + + /** + * Load k values from the pointers at the given position, and merge them in the storage. + */ + __device__ void load_sorted(const T* in, const IdxT* in_idx) + { + IdxT idx = kWarpWidth - 1 - Pow2::mod(laneId()); +#pragma unroll + for (int i = kMaxArrLen - 1; i >= 0; --i, idx += kWarpWidth) { + if (idx < k_) { + T t = in[idx]; + if (is_ordered(t, val_arr_[i])) { + val_arr_[i] = t; + idx_arr_[i] = in_idx[idx]; + } + } + } + topk::bitonic(Ascending, kWarpWidth).merge(val_arr_, idx_arr_); + } + + /** Save the content by the pointer location. */ + __device__ void store(T* out, IdxT* out_idx) const + { + IdxT idx = Pow2::mod(laneId()); +#pragma unroll kMaxArrLen + for (int i = 0; i < kMaxArrLen && idx < k_; i++, idx += kWarpWidth) { + out[idx] = val_arr_[i]; + out_idx[idx] = idx_arr_[i]; + } + } + + protected: + static constexpr int kWarpWidth = std::min(Capacity, WarpSize); + static constexpr int kMaxArrLen = Capacity / kWarpWidth; + + const IdxT k_; + const T dummy_; + T val_arr_[kMaxArrLen]; + IdxT idx_arr_[kMaxArrLen]; + + /** + * Merge another array (sorted in the opposite direction) in the queue. + * Thanks to the other array being sorted in the opposite direction, + * it's enough to call bitonic.merge once to maintain the valid state + * of the queue. + * + * @tparam PerThreadSizeIn + * the size of the other array per-thread (compared to `kMaxArrLen`). + * + * @param keys_in + * the values to be merged in. Pointers are unique per-thread. The values + * must already be sorted in the opposite direction. + * The layout of `keys_in` must be the same as the layout of `val_arr_`. + * @param ids_in + * the associated indices of the elements in the same format as `keys_in`. + */ + template + __device__ __forceinline__ void merge_in(const T* __restrict__ keys_in, + const IdxT* __restrict__ ids_in) + { +#pragma unroll + for (int i = std::min(kMaxArrLen, PerThreadSizeIn); i > 0; i--) { + T& key = val_arr_[kMaxArrLen - i]; + T other = keys_in[PerThreadSizeIn - i]; + if (is_ordered(other, key)) { + key = other; + idx_arr_[kMaxArrLen - i] = ids_in[PerThreadSizeIn - i]; + } + } + topk::bitonic(Ascending).merge(val_arr_, idx_arr_); + } +}; + +/** + * This version of warp_sort compares each input element against the current + * estimate of k-th value before adding it to the intermediate sorting buffer. + * This makes the algorithm do less sorting steps for long input sequences + * at the cost of extra checks on each step. + * + * This implementation is preferred for large len values. 
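+ *
+ * Rough per-element flow (see add() below): an incoming (val, idx) pair is first compared
+ * against the current estimate of the k-th value (k_th_); only pairs passing this check are
+ * appended to a small per-thread buffer (val_buf_/idx_buf_). The buffer is bitonic-sorted and
+ * merged into the main queue only when some lane of the warp runs out of buffer space, after
+ * which k_th_ is refreshed.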
+ */ +template +class warp_sort_filtered : public warp_sort { + static_assert(Capacity >= WarpSize); + + public: + __device__ warp_sort_filtered(int k, T dummy) + : warp_sort(k, dummy), buf_len_(0), k_th_(dummy) + { +#pragma unroll + for (int i = 0; i < kMaxBufLen; i++) { + val_buf_[i] = dummy_; + } + } + + __device__ void load(const T* in, const IdxT* in_idx, IdxT start, IdxT end) + { + const IdxT end_for_fullwarp = Pow2::roundUp(end - start) + start; + for (IdxT i = start + laneId(); i < end_for_fullwarp; i += WarpSize) { + T val = (i < end) ? in[i] : dummy_; + IdxT idx = (i < end) ? in_idx[i] : std::numeric_limits::max(); + add(val, idx); + } + } + + __device__ void add(T val, IdxT idx) + { + // comparing for k_th should reduce the total amount of updates: + // `false` means the input value is surely not in the top-k values. + if (is_ordered(val, k_th_)) { + // NB: the loop is used here to ensure the constant indexing, + // to not force the buffers spill into the local memory. +#pragma unroll + for (int i = 0; i < kMaxBufLen; i++) { + if (i == buf_len_) { + val_buf_[i] = val; + idx_buf_[i] = idx; + } + } + ++buf_len_; + } + if (any(buf_len_ == kMaxBufLen)) { merge_buf_(); } + } + + __device__ void done() + { + if (any(buf_len_ != 0)) { merge_buf_(); } + } + + private: + __device__ void set_k_th_() + { + // NB on using srcLane: it's ok if it is outside the warp size / width; + // the modulo op will be done inside the __shfl_sync. + k_th_ = shfl(val_arr_[kMaxArrLen - 1], k_ - 1); + } + + __device__ void merge_buf_() + { + topk::bitonic(!Ascending).sort(val_buf_, idx_buf_); + this->merge_in(val_buf_, idx_buf_); + buf_len_ = 0; + set_k_th_(); // contains warp sync +#pragma unroll + for (int i = 0; i < kMaxBufLen; i++) { + val_buf_[i] = dummy_; + } + } + + using warp_sort::kMaxArrLen; + using warp_sort::val_arr_; + using warp_sort::idx_arr_; + using warp_sort::k_; + using warp_sort::dummy_; + + static constexpr int kMaxBufLen = (Capacity <= 64) ? 2 : 4; + + T val_buf_[kMaxBufLen]; + IdxT idx_buf_[kMaxBufLen]; + int buf_len_; + + T k_th_; +}; + +/** + * This version of warp_sort adds every input element into the intermediate sorting + * buffer, and thus does the sorting step every `Capacity` input elements. + * + * This implementation is preferred for very small len values. + */ +template +class warp_sort_immediate : public warp_sort { + static_assert(Capacity >= WarpSize); + + public: + __device__ warp_sort_immediate(int k, T dummy) + : warp_sort(k, dummy), buf_len_(0) + { +#pragma unroll + for (int i = 0; i < kMaxArrLen; i++) { + val_buf_[i] = dummy_; + } + } + + __device__ void load(const T* in, const IdxT* in_idx, IdxT start, IdxT end) + { + add_first_(in, in_idx, start, end); + start += Capacity; + while (start < end) { + add_extra_(in, in_idx, start, end); + this->merge_in(val_buf_, idx_buf_); + start += Capacity; + } + } + + __device__ void add(T val, IdxT idx) + { + // NB: the loop is used here to ensure the constant indexing, + // to not force the buffers spill into the local memory. 
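+ // (with the loop fully unrolled, `i` is a compile-time constant in every iteration, so
+ // val_buf_[i]/idx_buf_[i] can stay in registers; indexing with the runtime value buf_len_
+ // directly would push the arrays into local memory)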
+#pragma unroll + for (int i = 0; i < kMaxArrLen; ++i) { + if (i == buf_len_) { + val_buf_[i] = val; + idx_buf_[i] = idx; + } + } + + ++buf_len_; + if (buf_len_ == kMaxArrLen) { + topk::bitonic(!Ascending).sort(val_buf_, idx_buf_); + this->merge_in(val_buf_, idx_buf_); +#pragma unroll + for (int i = 0; i < kMaxArrLen; i++) { + val_buf_[i] = dummy_; + } + buf_len_ = 0; + } + } + + __device__ void done() + { + if (buf_len_ != 0) { + topk::bitonic(!Ascending).sort(val_buf_, idx_buf_); + this->merge_in(val_buf_, idx_buf_); + } + } + + private: + /** Fill in the primary val_arr_/idx_arr_ */ + __device__ void add_first_(const T* in, const IdxT* in_idx, IdxT start, IdxT end) + { + IdxT idx = start + laneId(); + for (int i = 0; i < kMaxArrLen; ++i, idx += WarpSize) { + if (idx < end) { + val_arr_[i] = in[idx]; + idx_arr_[i] = in_idx[idx]; + } + } + topk::bitonic(Ascending).sort(val_arr_, idx_arr_); + } + + /** Fill in the secondary val_buf_/idx_buf_ */ + __device__ void add_extra_(const T* in, const IdxT* in_idx, IdxT start, IdxT end) + { + IdxT idx = start + laneId(); + for (int i = 0; i < kMaxArrLen; ++i, idx += WarpSize) { + val_buf_[i] = (idx < end) ? in[idx] : dummy_; + idx_buf_[i] = (idx < end) ? in_idx[idx] : std::numeric_limits::max(); + } + topk::bitonic(!Ascending).sort(val_buf_, idx_buf_); + } + + using warp_sort::kMaxArrLen; + using warp_sort::val_arr_; + using warp_sort::idx_arr_; + using warp_sort::k_; + using warp_sort::dummy_; + + T val_buf_[kMaxArrLen]; + IdxT idx_buf_[kMaxArrLen]; + int buf_len_; +}; + +/** + * This one is used for the second pass only: + * if the first pass happens in multiple blocks, the output consists of a series + * of sorted arrays, length `k` each. + * Under this assumption, we can use load_sorted to just do the merging, rather than + * the full sort. + */ +template +class warp_merge : public warp_sort { + public: + __device__ warp_merge(int k, T dummy) : warp_sort(k, dummy) {} + + // NB: the input is already sorted, because it's the second pass. + __device__ void load(const T* in, const IdxT* in_idx, IdxT start, IdxT end) + { + for (; start < end; start += k_) { + load_sorted(in + start, in_idx + start); + } + } + + __device__ void done() {} + + private: + using warp_sort::kWarpWidth; + using warp_sort::kMaxArrLen; + using warp_sort::val_arr_; + using warp_sort::idx_arr_; + using warp_sort::k_; + using warp_sort::dummy_; +}; + +template +int calc_smem_size_for_block_wide(int num_of_warp, IdxT k) +{ + return Pow2<256>::roundUp(num_of_warp / 2 * sizeof(T) * k) + num_of_warp / 2 * sizeof(IdxT) * k; +} + +template