From 179e1dff1b20866745e10d05422305cb502cad47 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 9 Mar 2022 09:57:54 +0100 Subject: [PATCH 01/41] Integrate new select-top-k implementations --- cpp/bench/CMakeLists.txt | 1 + cpp/bench/spatial/selection.cu | 132 +++ .../knn/detail/ivf_flat/bitonic_sort.cuh | 168 ++++ .../knn/detail/ivf_flat/radix_topk.cuh | 657 ++++++++++++++ .../knn/detail/ivf_flat/warpsort_topk.cuh | 850 ++++++++++++++++++ .../spatial/knn/detail/selection_faiss.cuh | 68 +- cpp/include/raft/spatial/knn/knn.cuh | 218 ++++- cpp/include/raft/spatial/knn/knn.hpp | 222 ++++- cpp/test/spatial/selection.cu | 386 ++++++-- cpp/test/test_utils.h | 26 + 10 files changed, 2502 insertions(+), 226 deletions(-) create mode 100644 cpp/bench/spatial/selection.cu create mode 100644 cpp/include/raft/spatial/knn/detail/ivf_flat/bitonic_sort.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/ivf_flat/radix_topk.cuh create mode 100644 cpp/include/raft/spatial/knn/detail/ivf_flat/warpsort_topk.cuh diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 9f0a6096d9..5214047571 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -19,6 +19,7 @@ set(RAFT_CPP_BENCH_TARGET "bench_raft") # (please keep the filenames in alphabetical order) add_executable(${RAFT_CPP_BENCH_TARGET} bench/linalg/reduce.cu + bench/spatial/selection.cu bench/main.cpp ) diff --git a/cpp/bench/spatial/selection.cu b/cpp/bench/spatial/selection.cu new file mode 100644 index 0000000000..644b983a7e --- /dev/null +++ b/cpp/bench/spatial/selection.cu @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include +#include + +namespace raft::bench::spatial { + +struct params { + int n_inputs; + int input_len; + int k; + int select_min; +}; + +template +struct selection : public Fixture { + selection(const std::string& name, const params& p) : Fixture(name), params_(p) {} + + protected: + void allocateBuffers(const ::benchmark::State& state) override + { + auto in_len = params_.n_inputs * params_.input_len; + alloc(in_dists_, in_len, false); + alloc(in_ids_, in_len, false); + alloc(out_dists_, params_.n_inputs * params_.k, false); + alloc(out_ids_, params_.n_inputs * params_.k, false); + + raft::sparse::iota_fill(in_ids_, IdxT(params_.n_inputs), IdxT(params_.input_len), stream); + raft::random::Rng(42).uniform(in_dists_, in_len, KeyT(-1.0), KeyT(1.0), stream); + } + + void deallocateBuffers(const ::benchmark::State& state) override + { + dealloc(in_dists_, params_.n_inputs * params_.input_len); + dealloc(in_ids_, params_.n_inputs * params_.input_len); + dealloc(out_dists_, params_.n_inputs * params_.k); + dealloc(out_ids_, params_.n_inputs * params_.k); + } + + void runBenchmark(::benchmark::State& state) override + { + rmm::mr::cuda_memory_resource cuda_mr; + rmm::mr::pool_memory_resource pool_mr{ + &cuda_mr, size_t(1) << size_t(30), size_t(16) << size_t(30)}; + rmm::mr::set_current_device_resource(&pool_mr); + try { + std::ostringstream label_stream; + label_stream << params_.n_inputs << "#" << params_.input_len << "#" << params_.k; + state.SetLabel(label_stream.str()); + loopOnState(state, [this]() { + raft::spatial::knn::select_k(in_dists_, + in_ids_, + params_.n_inputs, + params_.input_len, + out_dists_, + out_ids_, + params_.select_min, + params_.k, + stream, + Algo); + }); + } catch (raft::exception& e) { + state.SkipWithError(e.what()); + } + rmm::mr::set_current_device_resource(nullptr); + } + + private: + params params_; + KeyT *in_dists_, *out_dists_; + IdxT *in_ids_, *out_ids_; +}; + +const std::vector kInputs{ + {10000, 10, 3, true}, {10000, 10, 10, true}, {10000, 700, 3, true}, + {10000, 700, 32, true}, {10000, 2000, 64, true}, {10000, 10000, 7, true}, + {10000, 10000, 19, true}, {10000, 10000, 127, true}, + + {1000, 10000, 1, true}, {1000, 10000, 2, true}, {1000, 10000, 4, true}, + {1000, 10000, 8, true}, {1000, 10000, 16, true}, {1000, 10000, 32, true}, + {1000, 10000, 64, true}, {1000, 10000, 128, true}, {1000, 10000, 256, true}, + {1000, 10000, 512, true}, {1000, 10000, 1024, true}, {1000, 10000, 2048, true}, + + {100, 100000, 1, true}, {100, 100000, 2, true}, {100, 100000, 4, true}, + {100, 100000, 8, true}, {100, 100000, 16, true}, {100, 100000, 32, true}, + {100, 100000, 64, true}, {100, 100000, 128, true}, {100, 100000, 256, true}, + {100, 100000, 512, true}, {100, 100000, 1024, true}, {100, 100000, 2048, true}, + + {10, 1000000, 1, true}, {10, 1000000, 2, true}, {10, 1000000, 4, true}, + {10, 1000000, 8, true}, {10, 1000000, 16, true}, {10, 1000000, 32, true}, + {10, 1000000, 64, true}, {10, 1000000, 128, true}, {10, 1000000, 256, true}, + {10, 1000000, 512, true}, {10, 1000000, 1024, true}, {10, 1000000, 2048, true}, +}; + +#define SELECTION_REGISTER(KeyT, IdxT, Algo) \ + namespace BENCHMARK_PRIVATE_NAME(selection) \ + { \ + using SelectK = selection; \ + RAFT_BENCH_REGISTER(params, SelectK, #KeyT "/" #IdxT "/" #Algo, kInputs); \ + } + +SELECTION_REGISTER(float, int, FAISS); +SELECTION_REGISTER(float, int, RADIX_8_BITS); +SELECTION_REGISTER(float, int, RADIX_11_BITS); +SELECTION_REGISTER(float, int, WARP_SORT); + 
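+// For reference, a hedged usage sketch: assuming RAFT_BENCH_REGISTER forwards the
+// #KeyT "/" #IdxT "/" #Algo label above to Google Benchmark, individual cases can be
+// selected at run time with the standard regex filter of the `bench_raft` binary
+// (the name set via RAFT_CPP_BENCH_TARGET in cpp/bench/CMakeLists.txt), e.g.:
+//
+//   ./bench_raft --benchmark_filter='float/int/WARP_SORT'
+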
+// SELECTION_REGISTER(double, int, FAISS); +// SELECTION_REGISTER(double, int, RADIX_8_BITS); +// SELECTION_REGISTER(double, int, RADIX_11_BITS); +// SELECTION_REGISTER(double, int, WARP_SORT); + +} // namespace raft::bench::spatial diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat/bitonic_sort.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat/bitonic_sort.cuh new file mode 100644 index 0000000000..c99d9b0313 --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat/bitonic_sort.cuh @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::spatial::knn::detail::ivf_flat { + +namespace helpers { + +template +__device__ __forceinline__ void swap(T& x, T& y) +{ + T t = x; + x = y; + y = t; +} + +template +__device__ __forceinline__ void assign(bool cond, T* ptr, T x) +{ + if (cond) { *ptr = x; } +} + +template +__device__ __forceinline__ auto first(T x, Ts... xs) -> T +{ + return x; +} + +} // namespace helpers + +/** + * Bitonic merge at the warp level. + * + * @tparam Size is the number of elements (must be power of two). + * @tparam Ascending is the resulting order (true: ascending, false: descending). + */ +template +struct bitonic_merge { + static_assert(isPo2(Size)); + + /** How many contiguous elements are processed by one thread. */ + static constexpr int kArrLen = Size / WarpSize; + static constexpr int kStride = kArrLen / 2; + + template + using when_fits_in_warp = + std::enable_if_t<(Fits == (Size <= WarpSize)) && std::is_same_v, void>; + + template + static __device__ auto run(bool reverse, KeyT* keys, PayloadTs*... payloads) + -> when_fits_in_warp + { + for (int i = 0; i < kStride; ++i) { + const int other_i = i + kStride; + KeyT& key = keys[i]; + KeyT& other = keys[other_i]; + bool do_swap = Ascending != reverse ? key > other : key < other; + // Normally, we expect `payloads` to be the array of indices from 0 to len; + // in that case, the construct below makes the sorting stable. + if constexpr (sizeof...(payloads) > 0) { + if (key == other) { + do_swap = + reverse != (helpers::first(payloads...)[i] > helpers::first(payloads...)[other_i]); + } + } + if (do_swap) { + helpers::swap(key, other); + (helpers::swap(payloads[i], payloads[other_i]), ...); + } + } + + bitonic_merge::run(reverse, keys, payloads...); + bitonic_merge::run(reverse, keys + kStride, (payloads + kStride)...); + } + + template + static __device__ auto run(bool reverse, KeyT* keys, PayloadTs*... payloads) + -> when_fits_in_warp + { + const int lane = threadIdx.x % Size; + for (int stride = Size / 2; stride > 0; stride /= 2) { + bool is_second = lane & stride; + KeyT& key = *keys; + KeyT other = shfl_xor(key, stride, Size); + + bool asc = Ascending != reverse; + bool do_assign = key != other && ((key > other) == (asc != is_second)); + // Normally, we expect `payloads` to be the array of indices from 0 to len; + // in that case, the construct below makes the sorting stable. 
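+      // (In other words, when two keys compare equal, the first payload acts as a secondary
+      // sort key, so the relative order of equal keys is decided by their original indices.)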
+ if constexpr (sizeof...(payloads) > 0) { + auto payload_this = *helpers::first(payloads...); + auto payload_that = shfl_xor(payload_this, stride, Size); + if (key == other) { do_assign = reverse != ((payload_this > payload_that) != is_second); } + } + + helpers::assign(do_assign, keys, other); + // NB: don't put shfl_xor in a conditional; it must be called by all threads in a warp. + (helpers::assign(do_assign, payloads, shfl_xor(*payloads, stride, Size)), ...); + } + } + + template + static __device__ __forceinline__ void run(KeyT* keys, PayloadTs*... payloads) + { + return run(false, keys, payloads...); + } +}; + +/** + * Bitonic sort at the warp level. + * + * @tparam Size is the number of elements (must be power of two). + * @tparam Ascending is the resulting order (true: ascending, false: descending). + */ +template +struct bitonic_sort { + static_assert(isPo2(Size)); + + static constexpr int kSize2 = Size / 2; + + template + static __device__ __forceinline__ void run(bool reverse, KeyT* keys, PayloadTs*... payloads) + { + if constexpr (Size > 2) { + // NB: the `reverse` expression here is always `0` (false) when `Size > WarpSize` + bitonic_sort::run(laneId() & kSize2, keys, payloads...); + } + if constexpr (Size > WarpSize) { + // NB: this part is executed only if the size of the input arrays is larger than the warp. + constexpr int kShift = kSize2 / WarpSize; + bitonic_sort::run(true, keys + kShift, (payloads + kShift)...); + } + bitonic_merge::run(reverse, keys, payloads...); + } + + /** + * Execute the sort. + * + * @param keys + * is a device pointer to a contiguous array of keys, unique per thread; + * @param payloads + * are zero or more associated arrays of the same size as keys, which are sorted together with + * the keys. + */ + template + static __device__ __forceinline__ void run(KeyT* keys, PayloadTs*... payloads) + { + return run(false, keys, payloads...); + } +}; + +} // namespace raft::spatial::knn::detail::ivf_flat diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat/radix_topk.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat/radix_topk.cuh new file mode 100644 index 0000000000..a48f7a1e3c --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat/radix_topk.cuh @@ -0,0 +1,657 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include + +/* + Two implementations: + + (1) radix select (select + filter): + first select the k-th value by going through radix passes, + then filter out all wanted data from original data + + (2) radix topk: + filter out wanted data directly while going through radix passes +*/ + +namespace raft::spatial::knn::detail::ivf_flat { + +inline size_t calc_aligned_size(const std::vector& sizes) +{ + const size_t ALIGN_BYTES = 256; + const size_t ALIGN_MASK = ~(ALIGN_BYTES - 1); + size_t total = 0; + for (auto sz : sizes) { + total += (sz + ALIGN_BYTES - 1) & ALIGN_MASK; + } + return total + ALIGN_BYTES - 1; +} + +inline std::vector calc_aligned_pointers(const void* p, const std::vector& sizes) +{ + const size_t ALIGN_BYTES = 256; + const size_t ALIGN_MASK = ~(ALIGN_BYTES - 1); + + char* ptr = reinterpret_cast((reinterpret_cast(p) + ALIGN_BYTES - 1) & ALIGN_MASK); + + std::vector aligned_pointers; + aligned_pointers.reserve(sizes.size()); + for (auto sz : sizes) { + aligned_pointers.push_back(ptr); + ptr += (sz + ALIGN_BYTES - 1) & ALIGN_MASK; + } + + return aligned_pointers; +} + +constexpr int BLOCK_DIM = 512; +constexpr int ITEM_PER_THREAD = 32; + +template +__host__ __device__ constexpr int calc_num_buckets() +{ + return 1 << BITS_PER_PASS; +} + +template +__host__ __device__ constexpr int calc_num_passes() +{ + return (sizeof(T) * 8 - 1) / BITS_PER_PASS + 1; +} + +// bit 0 is the least significant (rightmost) bit +// this function works even when pass=-1, which is used in calc_mask() +template +__device__ constexpr int calc_start_bit(int pass) +{ + int start_bit = static_cast(sizeof(T) * 8) - (pass + 1) * BITS_PER_PASS; + if (start_bit < 0) { start_bit = 0; } + return start_bit; +} + +template +__device__ constexpr unsigned calc_mask(int pass) +{ + static_assert(BITS_PER_PASS <= 31); + int num_bits = + calc_start_bit(pass - 1) - calc_start_bit(pass); + return (1 << num_bits) - 1; +} + +template +__device__ typename cub::Traits::UnsignedBits twiddle_in(T key, bool greater) +{ + auto bits = reinterpret_cast::UnsignedBits&>(key); + bits = cub::Traits::TwiddleIn(bits); + if (greater) { bits = ~bits; } + return bits; +} + +template +__device__ int calc_bucket(T x, int start_bit, unsigned mask, bool greater) +{ + static_assert(BITS_PER_PASS <= sizeof(int) * 8 - 1); // so return type can be int + return (twiddle_in(x, greater) >> start_bit) & mask; +} + +template +__device__ void vectorized_process(const T* in, idxT len, Func f) +{ + using WideT = float4; + + const idxT stride = blockDim.x * gridDim.x; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if constexpr (sizeof(T) >= sizeof(WideT)) { + for (idxT i = tid; i < len; i += stride) { + f(in[i], i); + } + } else { + static_assert(sizeof(WideT) % sizeof(T) == 0); + constexpr int items_per_scalar = sizeof(WideT) / sizeof(T); + // TODO: it's UB + union { + WideT scalar; + T array[items_per_scalar]; + } wide; + + int skip_cnt = (reinterpret_cast(in) % sizeof(WideT)) + ? 
((sizeof(WideT) - reinterpret_cast(in) % sizeof(WideT)) / sizeof(T)) + : 0; + if (skip_cnt > len) { skip_cnt = len; } + const WideT* in_cast = reinterpret_cast(in + skip_cnt); + const idxT len_cast = (len - skip_cnt) / items_per_scalar; + for (idxT i = tid; i < len_cast; i += stride) { + wide.scalar = in_cast[i]; + const idxT real_i = skip_cnt + i * items_per_scalar; +#pragma unroll + for (int j = 0; j < items_per_scalar; ++j) { + f(wide.array[j], real_i + j); + } + } + + static_assert(WarpSize >= items_per_scalar); + // and because items_per_scalar > skip_cnt, WarpSize > skip_cnt + // no need to use loop + if (tid < skip_cnt) { f(in[tid], tid); } + // because len_cast = (len - skip_cnt) / items_per_scalar, + // len_cast * items_per_scalar + items_per_scalar > len - skip_cnt; + // and so + // len - (skip_cnt + len_cast * items_per_scalar) < items_per_scalar <= WarpSize + // no need to use loop + const idxT remain_i = skip_cnt + len_cast * items_per_scalar + tid; + if (remain_i < len) { f(in[remain_i], remain_i); } + } +} + +template +struct Counter { + idxT k; + idxT len; + idxT previous_len; + int bucket; + + idxT filter_cnt; + unsigned int finished_block_cnt; + idxT out_cnt; + idxT out_back_cnt; + T kth_value; +}; + +template +__device__ void filter_and_histogram(const T* in_buf, + const idxT* in_idx_buf, + T* out_buf, + idxT* out_idx_buf, + T* out, + idxT* out_idx, + idxT len, + Counter* counter, + idxT* histogram, + bool greater, + int pass, + int k) +{ + constexpr int num_buckets = calc_num_buckets(); + __shared__ idxT histogram_smem[num_buckets]; + for (idxT i = threadIdx.x; i < num_buckets; i += blockDim.x) { + histogram_smem[i] = 0; + } + __syncthreads(); + + const int start_bit = calc_start_bit(pass); + const unsigned mask = calc_mask(pass); + + if (pass == 0) { + auto f = [greater, start_bit, mask](T value, idxT) { + int bucket = calc_bucket(value, start_bit, mask, greater); + atomicAdd(histogram_smem + bucket, 1); + }; + vectorized_process(in_buf, len, f); + } else { + const idxT previous_len = counter->previous_len; + const int want_bucket = counter->bucket; + idxT& filter_cnt = counter->filter_cnt; + idxT& out_cnt = counter->out_cnt; + T& kth_value = counter->kth_value; + const idxT counter_len = counter->len; + const int previous_start_bit = calc_start_bit(pass - 1); + const unsigned previous_mask = calc_mask(pass - 1); + + auto f = [in_idx_buf, + out_buf, + out_idx_buf, + out, + out_idx, + greater, + k, + start_bit, + mask, + previous_start_bit, + previous_mask, + want_bucket, + &filter_cnt, + &out_cnt, + &kth_value, + counter_len](T value, idxT i) { + int prev_bucket = + calc_bucket(value, previous_start_bit, previous_mask, greater); + if (prev_bucket == want_bucket) { + idxT pos = atomicAdd(&filter_cnt, 1); + out_buf[pos] = value; + if (out_idx_buf) { out_idx_buf[pos] = in_idx_buf ? in_idx_buf[i] : i; } + int bucket = calc_bucket(value, start_bit, mask, greater); + atomicAdd(histogram_smem + bucket, 1); + + if (counter_len == 1) { + if (out) { + out[k - 1] = value; + out_idx[k - 1] = in_idx_buf ? in_idx_buf[i] : i; + } else { + kth_value = value; + } + } + } else if (out && prev_bucket < want_bucket) { + idxT pos = atomicAdd(&out_cnt, 1); + out[pos] = value; + out_idx[pos] = in_idx_buf ? 
in_idx_buf[i] : i; + } + }; + + vectorized_process(in_buf, previous_len, f); + } + __syncthreads(); + + for (int i = threadIdx.x; i < num_buckets; i += blockDim.x) { + if (histogram_smem[i] != 0) { atomicAdd(histogram + i, histogram_smem[i]); } + } +} + +template +__device__ void scan(volatile idxT* histogram, + const int start, + const int num_buckets, + const idxT current) +{ + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + idxT thread_data = 0; + int index = start + threadIdx.x; + if (index < num_buckets) { thread_data = histogram[index]; } + + BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + __syncthreads(); + if (index < num_buckets) { histogram[index] = thread_data + current; } + __syncthreads(); // This sync is necessary, as the content of histogram needs to be + // read after +} + +template +__device__ void choose_bucket(Counter* counter, idxT* histogram, const idxT k) +{ + constexpr int num_buckets = calc_num_buckets(); + int index = threadIdx.x; + idxT current_value = 0; + int num_pass = 1; + if constexpr (num_buckets >= NUM_THREAD) { + static_assert(num_buckets % NUM_THREAD == 0); + num_pass = num_buckets / NUM_THREAD; + } + + for (int i = 0; i < num_pass && (current_value < k); i++) { + scan(histogram, i * NUM_THREAD, num_buckets, current_value); + if (index < num_buckets) { + idxT prev = (index == 0) ? 0 : histogram[index - 1]; + idxT cur = histogram[index]; + + // one and only one thread will satisfy this condition, so only write once + if (prev < k && cur >= k) { + counter->k = k - prev; + counter->previous_len = counter->len; + counter->len = cur - prev; + counter->bucket = index; + } + } + index += NUM_THREAD; + current_value = histogram[(i + 1) * NUM_THREAD - 1]; + } +} + +template +__global__ void radix_kernel(const T* in_buf, + const idxT* in_idx_buf, + T* out_buf, + idxT* out_idx_buf, + T* out, + idxT* out_idx, + Counter* counters, + idxT* histograms, + const idxT len, + const idxT k, + const bool greater, + const int pass) +{ + __shared__ bool isLastBlockDone; + + constexpr int num_buckets = calc_num_buckets(); + constexpr int num_passes = calc_num_passes(); + const int batch_id = blockIdx.y; + in_buf += batch_id * len; + out_buf += batch_id * len; + if (in_idx_buf) { in_idx_buf += batch_id * len; } + if (out_idx_buf) { out_idx_buf += batch_id * len; } + if (out) { + out += batch_id * k; + out_idx += batch_id * k; + } + auto counter = counters + batch_id; + auto histogram = histograms + batch_id * num_buckets; + + filter_and_histogram(in_buf, + in_idx_buf, + out_buf, + out_idx_buf, + out, + out_idx, + len, + counter, + histogram, + greater, + pass, + k); + __threadfence(); + + if (threadIdx.x == 0) { + unsigned int finished = atomicInc(&counter->finished_block_cnt, gridDim.x - 1); + isLastBlockDone = (finished == (gridDim.x - 1)); + } + + // Synchronize to make sure that each thread reads the correct value of + // isLastBlockDone. 
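+  // (Note on the pattern above: __threadfence() makes this block's histogram and counter
+  // updates visible before the atomicInc() result is observed by other blocks, and passing
+  // gridDim.x - 1 to atomicInc() both identifies the last arriving block and wraps the
+  // counter back to zero, so it is ready for the next pass.)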
+ __syncthreads(); + if (isLastBlockDone) { + if (counter->len == 1 && threadIdx.x == 0) { + counter->previous_len = 0; + counter->len = 0; + } + // init counter, other members of counter is initialized with 0 by cudaMemset() + if (pass == 0 && threadIdx.x == 0) { + counter->k = k; + counter->len = len; + if (out) { counter->out_back_cnt = 0; } + } + __syncthreads(); + + idxT ori_k = counter->k; + + if (counter->len > 0) { + choose_bucket(counter, histogram, ori_k); + } + + __syncthreads(); + if (pass == num_passes - 1) { + const idxT previous_len = counter->previous_len; + const int want_bucket = counter->bucket; + int start_bit = calc_start_bit(pass); + unsigned mask = calc_mask(pass); + + if (!out) { // radix select + for (idxT i = threadIdx.x; i < previous_len; i += blockDim.x) { + const T value = out_buf[i]; + int bucket = calc_bucket(value, start_bit, mask, greater); + if (bucket == want_bucket) { + // TODO: UB + // could use atomicExch, but it's not defined for T=half + counter->kth_value = value; + break; + } + } + } else { // radix topk + idxT& out_cnt = counter->out_cnt; + for (idxT i = threadIdx.x; i < previous_len; i += blockDim.x) { + const T value = out_buf[i]; + int bucket = calc_bucket(value, start_bit, mask, greater); + if (bucket < want_bucket) { + idxT pos = atomicAdd(&out_cnt, 1); + out[pos] = value; + out_idx[pos] = out_idx_buf[i]; + } else if (bucket == want_bucket) { + idxT needed_num_of_kth = counter->k; + idxT back_pos = atomicAdd(&(counter->out_back_cnt), 1); + if (back_pos < needed_num_of_kth) { + idxT pos = k - 1 - back_pos; + out[pos] = value; + out_idx[pos] = out_idx_buf[i]; + } + } + } + __syncthreads(); + } + } else { + // reset for next pass + for (int i = threadIdx.x; i < num_buckets; i += blockDim.x) { + histogram[i] = 0; + } + if (threadIdx.x == 0) { counter->filter_cnt = 0; } + } + } +} + +template +__global__ void final_filter(const T* in, + const idxT len, + const idxT k, + Counter* counters, + T* out, + idxT* out_idx, + bool greater) +{ + const int batch_id = blockIdx.y; + const T kth_value = counters[batch_id].kth_value; + const idxT needed_num_of_kth = counters[batch_id].k; + idxT& out_cnt = counters[batch_id].out_cnt; + idxT& out_back_cnt = counters[batch_id].out_back_cnt; + + in = in + batch_id * len; + out = out + batch_id * k; + out_idx = out_idx + batch_id * k; + + auto f = [k, greater, kth_value, needed_num_of_kth, &out_cnt, &out_back_cnt, out, out_idx]( + T val, idxT i) { + if ((greater && val > kth_value) || (!greater && val < kth_value)) { + idxT pos = atomicAdd(&out_cnt, 1); + out[pos] = val; + out_idx[pos] = i; + } else if (val == kth_value) { + idxT back_pos = atomicAdd(&out_back_cnt, 1); + if (back_pos < needed_num_of_kth) { + idxT pos = k - 1 - back_pos; + out[pos] = val; + out_idx[pos] = i; + } + } + }; + vectorized_process(in, len, f); +} + +template +void radix_select_topk(void* buf, + size_t& buf_size, + const T* in, + idxT batch_size, + idxT len, + idxT k, + T* out, + idxT* out_idx, + bool greater, + cudaStream_t stream) +{ + // TODO: is it possible to relax this restriction? 
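+  // Worked example, assuming T = float (32 key bits) and the calc_* helpers defined above:
+  //   BITS_PER_PASS = 8  ->  num_passes = (32 - 1) / 8  + 1 = 4,  num_buckets = 1 << 8  = 256
+  //   BITS_PER_PASS = 11 ->  num_passes = (32 - 1) / 11 + 1 = 3,  num_buckets = 1 << 11 = 2048
+  // Both configurations satisfy the `num_passes > 1` requirement asserted below.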
+ static_assert(calc_num_passes() > 1); + constexpr int num_buckets = calc_num_buckets(); + + Counter* counters = nullptr; + idxT* histograms = nullptr; + T* buf1 = nullptr; + T* buf2 = nullptr; + { + std::vector sizes = {sizeof(*counters) * batch_size, + sizeof(*histograms) * num_buckets * batch_size, + sizeof(*buf1) * len * batch_size, + sizeof(*buf2) * len * batch_size}; + size_t total_size = calc_aligned_size(sizes); + if (!buf) { + buf_size = total_size; + return; + } + + std::vector aligned_pointers = calc_aligned_pointers(buf, sizes); + counters = static_cast(aligned_pointers[0]); + histograms = static_cast(aligned_pointers[1]); + buf1 = static_cast(aligned_pointers[2]); + buf2 = static_cast(aligned_pointers[3]); + + RAFT_CUDA_TRY(cudaMemsetAsync( + buf, + 0, + static_cast(aligned_pointers[2]) - static_cast(aligned_pointers[0]), + stream)); + } + + const T* in_buf = nullptr; + T* out_buf = nullptr; + + dim3 blocks((len - 1) / (NUM_THREAD * ITEM_PER_THREAD) + 1, batch_size); + + constexpr int num_passes = calc_num_passes(); + for (int pass = 0; pass < num_passes; ++pass) { + if (pass == 0) { + in_buf = in; + out_buf = nullptr; + } else if (pass == 1) { + in_buf = in; + out_buf = buf1; + } else { + in_buf = (pass % 2 == 0) ? buf1 : buf2; + out_buf = (pass % 2 == 0) ? buf2 : buf1; + } + radix_kernel<<>>(in_buf, + nullptr, + out_buf, + nullptr, + nullptr, + nullptr, + counters, + histograms, + len, + k, + greater, + pass); + } + + constexpr int FILTER_BLOCK_DIM = 256; + constexpr int FILTER_ITEM_PER_THREAD = 32; + dim3 filter_blocks((len - 1) / (FILTER_BLOCK_DIM * FILTER_ITEM_PER_THREAD) + 1, batch_size); + final_filter<<>>( + in, len, k, counters, out, out_idx, greater); +} + +template +void radix_topk(void* buf, + size_t& buf_size, + const T* in, + const idxT* in_idx, + idxT batch_size, + idxT len, + idxT k, + T* out, + idxT* out_idx, + bool greater, + cudaStream_t stream) +{ + // TODO: is it possible to relax this restriction? + static_assert(calc_num_passes() > 1); + constexpr int num_buckets = calc_num_buckets(); + + Counter* counters = nullptr; + idxT* histograms = nullptr; + T* buf1 = nullptr; + idxT* idx_buf1 = nullptr; + T* buf2 = nullptr; + idxT* idx_buf2 = nullptr; + { + std::vector sizes = {sizeof(*counters) * batch_size, + sizeof(*histograms) * num_buckets * batch_size, + sizeof(*buf1) * len * batch_size, + sizeof(*idx_buf1) * len * batch_size, + sizeof(*buf2) * len * batch_size, + sizeof(*idx_buf2) * len * batch_size}; + size_t total_size = calc_aligned_size(sizes); + if (!buf) { + buf_size = total_size; + return; + } + + std::vector aligned_pointers = calc_aligned_pointers(buf, sizes); + counters = static_cast(aligned_pointers[0]); + histograms = static_cast(aligned_pointers[1]); + buf1 = static_cast(aligned_pointers[2]); + idx_buf1 = static_cast(aligned_pointers[3]); + buf2 = static_cast(aligned_pointers[4]); + idx_buf2 = static_cast(aligned_pointers[5]); + + RAFT_CUDA_TRY(cudaMemsetAsync( + buf, + 0, + static_cast(aligned_pointers[2]) - static_cast(aligned_pointers[0]), + stream)); + } + + const T* in_buf = nullptr; + const idxT* in_idx_buf = nullptr; + T* out_buf = nullptr; + idxT* out_idx_buf = nullptr; + + dim3 blocks((len - 1) / (NUM_THREAD * ITEM_PER_THREAD) + 1, batch_size); + + constexpr int num_passes = calc_num_passes(); + + for (int pass = 0; pass < num_passes; ++pass) { + if (pass == 0) { + in_buf = in; + in_idx_buf = nullptr; + out_buf = nullptr; + out_idx_buf = nullptr; + } else if (pass == 1) { + in_buf = in; + in_idx_buf = in_idx ? 
in_idx : nullptr; + out_buf = buf1; + out_idx_buf = idx_buf1; + } else if (pass % 2 == 0) { + in_buf = buf1; + in_idx_buf = idx_buf1; + out_buf = buf2; + out_idx_buf = idx_buf2; + } else { + in_buf = buf2; + in_idx_buf = idx_buf2; + out_buf = buf1; + out_idx_buf = idx_buf1; + } + + radix_kernel<<>>(in_buf, + in_idx_buf, + out_buf, + out_idx_buf, + out, + out_idx, + counters, + histograms, + len, + k, + greater, + pass); + } +} + +} // namespace raft::spatial::knn::detail::ivf_flat diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat/warpsort_topk.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat/warpsort_topk.cuh new file mode 100644 index 0000000000..1ffeb7335f --- /dev/null +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat/warpsort_topk.cuh @@ -0,0 +1,850 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "bitonic_sort.cuh" + +#include +#include + +#include +#include +#include + +/* + Three APIs of different scope are provided: + 1. host function: warp_sort_topk() + 2. block-wide API: class WarpSortBlockWide + 3. warp-wide API: class WarpSelect and class WarpBitonic + + + 1. warp_sort_topk() + Like CUB functions, it should be called twice. + First for getting required buffer size, and a second for the real top-k computation. + For the first call, buf==nullptr should be passed, and required buffer + size is returned as parameter buf_size. + For the second call, pass allocated buffer of required size. + + Example: + void* buf = nullptr; + size_t buf_size; + warp_sort_topk(nullptr, buf_size, ...); // will set buf_size + cudaMalloc(&buf, buf_size); + warp_sort_topk(buf, buf_size, ...); + + + 2. class WarpSortBlockWide + It can be regarded as a fixed size priority queue for a thread block, + although the API is not typical. + class WarpSelect and WarpBitonic can be used to instantiate WarpSortBlockWide. + + It uses dynamic shared memory as intermediate buffer. + So the required shared memory size should be calculated using + calc_smem_size_for_block_wide() and passed as the 3rd kernel launch parameter. + + Two overloaded add() functions can be used to add items to the queue. + One is add(const T* in, idxT start, idxT end) and it adds a range of items, + namely [start, end) of in. The idx is inferred from start. + This function should be called only once to add all items, and should not be + used together with the second form of add(). + The second one is add(T val, idxT idx), and it adds only one item pair. + Note that the range [start, end) is for the whole block of threads, that is, + each thread in the same block should get the same start/end. + In contrast, the parameters of the second form are for only one thread, + so each thread must get different val/idx. + + After adding is finished, function done() should be called. And finally, + dump() is used to get the top-k result. 
+ + Example: + __global__ void kernel() { + WarpSortBlockWide queue(...); + + // way 1, [0, len) is same for the whole block + queue.add(in, 0, len); + // way 2, each thread gets its own val/idx pair + for (idxT i = threadIdx.x; i < len, i += blockDim.x) { + queue.add(in[i], idx[i]); + } + + queue.done(); + queue.dump(out, out_idx); + } + + int smem_size = calc_smem_size_for_block_wide(...); + kernel<>>(); + + + 3. class WarpSelect and class WarpBitonic + These two classes can be regarded as fixed sized priority queue for a warp. + Usage is similar to class WarpSortBlockWide. + Two types of add() functions are provided, and also note that [start, end) is + for a whole warp, while val/idx is for a thread. + No shared memory is needed. + + Example: + __global__ void kernel() { + WarpBitonic<...> queue(...); + int warp_id = threadIdx.x / WarpSize; + int lane_id = threadIdx.x % WarpSize; + + // way 1, [0, len) is same for the whole warp + queue.add(in, 0, len); + // way 2, each thread gets its own val/idx pair + for (idxT i = lane_id; i < len, i += WarpSize) { + queue.add(in[i], idx[i]); + } + + queue.done(); + // each warp outputs to a different offset + queue.dump(out+ warp_id * k * sizeof(T), out_idx+ warp_id * k * sizeof(idxT)); + } + */ + +namespace raft::spatial::knn::detail::ivf_flat { + +namespace { + +template +constexpr T get_lower_bound() +{ + if (std::numeric_limits::has_infinity && std::numeric_limits::is_signed) { + return -std::numeric_limits::infinity(); + } else { + return std::numeric_limits::lowest(); + } +} + +template +constexpr T get_upper_bound() +{ + if (std::numeric_limits::has_infinity) { + return std::numeric_limits::infinity(); + } else { + return std::numeric_limits::max(); + } +} + +template +constexpr T get_dummy(bool greater) +{ + return greater ? get_lower_bound() : get_upper_bound(); +} + +template +__device__ inline bool is_greater_than(T val, T baseline) +{ + if constexpr (greater) { return val > baseline; } + if constexpr (!greater) { return val < baseline; } +} + +template +constexpr HDI T nextHighestPowerOf2(T v) +{ + /** + * TODO: Not entirely sure if this is what we need in the code of this file. + * It returns `r`, such that r > v, r <= v*2, and r is power of two. + */ + return isPo2(v) ? 
(v << (T)1) : ((T)1 << (log2(v) + 1)); +} + +int calc_capacity(int k) +{ + int capacity = nextHighestPowerOf2(k); + if (capacity < WarpSize) { capacity = WarpSize; } + return capacity; +} +} // namespace +template +class WarpSort { + public: + __device__ WarpSort(idxT k, T dummy) : lane_(threadIdx.x % WarpSize), k_(k), dummy_(dummy) + { + static_assert(capacity >= WarpSize && isPo2(capacity)); + + for (int i = 0; i < max_arr_len_; ++i) { + val_arr_[i] = dummy_; + } + } + + // load and merge k sorted values + __device__ void load_sorted(const T* in, const idxT* in_idx, idxT start) + { + idxT idx = start + WarpSize - 1 - lane_; + for (int i = max_arr_len_ - 1; i >= 0; --i, idx += WarpSize) { + if (idx < start + k_) { + T t = in[idx]; + if (is_greater_than(t, val_arr_[i])) { + val_arr_[i] = t; + idx_arr_[i] = in_idx[idx]; + } + } + } + + bitonic_merge::run(val_arr_, idx_arr_); + } + + __device__ void dump(T* out, idxT* out_idx) const + { + for (int i = 0; i < max_arr_len_; ++i) { + idxT out_i = i * WarpSize + lane_; + if (out_i < k_) { + out[out_i] = val_arr_[i]; + out_idx[out_i] = idx_arr_[i]; + } + } + } + + protected: + static constexpr int max_arr_len_ = capacity / WarpSize; + + T val_arr_[max_arr_len_]; + idxT idx_arr_[max_arr_len_]; + + const int lane_; + const idxT k_; + const T dummy_; +}; + +template +class WarpSelect : public WarpSort { + public: + __device__ WarpSelect(idxT k, T dummy) + : WarpSort(k, dummy), + buf_len_(0), + k_th_(dummy), + k_th_lane_((k - 1) % WarpSize) + { + for (int i = 0; i < max_buf_len_; ++i) { + val_buf_[i] = dummy_; + } + } + + __device__ void add(const T* in, idxT start, idxT end) + { + const idxT end_for_fullwarp = Pow2::roundUp(end - start) + start; + for (idxT i = start + lane_; i < end_for_fullwarp; i += WarpSize) { + T val = (i < end) ? 
in[i] : dummy_; + add(val, i); + } + } + + __device__ void add(T val, idxT idx) + { + if (is_greater_than(val, k_th_)) { + for (int i = 0; i < max_buf_len_ - 1; ++i) { + val_buf_[i] = val_buf_[i + 1]; + idx_buf_[i] = idx_buf_[i + 1]; + } + val_buf_[max_buf_len_ - 1] = val; + idx_buf_[max_buf_len_ - 1] = idx; + + ++buf_len_; + } + + if (any(buf_len_ == max_buf_len_)) { merge_buf_(); } + } + + __device__ void done() + { + if (any(buf_len_ != 0)) { merge_buf_(); } + } + + private: + __device__ void set_k_th_() + { + // it's the best we can do, should use "val_arr_[k_th_row_]" + k_th_ = shfl(val_arr_[max_arr_len_ - 1], k_th_lane_); + } + + __device__ void merge_buf_() + { + bitonic_sort::run(val_buf_, idx_buf_); + + if (max_arr_len_ > max_buf_len_) { + for (int i = 0; i < max_buf_len_; ++i) { + T& val = val_arr_[max_arr_len_ - max_buf_len_ + i]; + T& buf = val_buf_[i]; + if (is_greater_than(buf, val)) { + val = buf; + idx_arr_[max_arr_len_ - max_buf_len_ + i] = idx_buf_[i]; + } + } + } else if (max_arr_len_ < max_buf_len_) { + for (int i = 0; i < max_arr_len_; ++i) { + T& val = val_arr_[i]; + T& buf = val_buf_[max_buf_len_ - max_arr_len_ + i]; + if (is_greater_than(buf, val)) { + val = buf; + idx_arr_[i] = idx_buf_[max_buf_len_ - max_arr_len_ + i]; + } + } + } else { + for (int i = 0; i < max_arr_len_; ++i) { + if (is_greater_than(val_buf_[i], val_arr_[i])) { + val_arr_[i] = val_buf_[i]; + idx_arr_[i] = idx_buf_[i]; + } + } + } + + bitonic_merge::run(val_arr_, idx_arr_); + + buf_len_ = 0; + set_k_th_(); // contains sync + for (int i = 0; i < max_buf_len_; ++i) { + val_buf_[i] = dummy_; + } + } + + using WarpSort::max_arr_len_; + using WarpSort::val_arr_; + using WarpSort::idx_arr_; + using WarpSort::lane_; + using WarpSort::k_; + using WarpSort::dummy_; + + static constexpr int max_buf_len_ = (capacity <= 64) ? 2 : 4; + + T val_buf_[max_buf_len_]; + idxT idx_buf_[max_buf_len_]; + int buf_len_; + + T k_th_; + const int k_th_lane_; +}; + +template +class WarpBitonic : public WarpSort { + public: + __device__ WarpBitonic(idxT k, T dummy) + : WarpSort(k, dummy), buf_len_(0) + { + for (int i = 0; i < max_arr_len_; ++i) { + val_buf_[i] = dummy_; + } + } + + __device__ void add(const T* in, idxT start, idxT end) + { + add_first_(in, start, end); + start += capacity; + while (start < end) { + add_extra_(in, start, end); + merge_(); + start += capacity; + } + } + + __device__ void add(T val, idxT idx) + { + for (int i = 0; i < max_arr_len_; ++i) { + if (i == buf_len_) { + val_buf_[i] = val; + idx_buf_[i] = idx; + } + } + + ++buf_len_; + if (buf_len_ == max_arr_len_) { + bitonic_sort::run(val_buf_, idx_buf_); + merge_(); + + for (int i = 0; i < max_arr_len_; ++i) { + val_buf_[i] = dummy_; + } + buf_len_ = 0; + } + } + + __device__ void done() + { + if (buf_len_ != 0) { + bitonic_sort::run(val_buf_, idx_buf_); + merge_(); + } + } + + private: + __device__ void add_first_(const T* in, idxT start, idxT end) + { + idxT idx = start + lane_; + for (int i = 0; i < max_arr_len_; ++i, idx += WarpSize) { + if (idx < end) { + val_arr_[i] = in[idx]; + idx_arr_[i] = idx; + } + } + bitonic_sort::run(val_arr_, idx_arr_); + } + + __device__ void add_extra_(const T* in, idxT start, idxT end) + { + idxT idx = start + lane_; + for (int i = 0; i < max_arr_len_; ++i, idx += WarpSize) { + val_buf_[i] = (idx < end) ? 
in[idx] : dummy_; + idx_buf_[i] = idx; + } + bitonic_sort::run(val_buf_, idx_buf_); + } + + __device__ void merge_() + { + for (int i = 0; i < max_arr_len_; ++i) { + if (is_greater_than(val_buf_[i], val_arr_[i])) { + val_arr_[i] = val_buf_[i]; + idx_arr_[i] = idx_buf_[i]; + } + } + bitonic_merge::run(val_arr_, idx_arr_); + } + + using WarpSort::max_arr_len_; + using WarpSort::val_arr_; + using WarpSort::idx_arr_; + using WarpSort::lane_; + using WarpSort::k_; + using WarpSort::dummy_; + + T val_buf_[max_arr_len_]; + idxT idx_buf_[max_arr_len_]; + int buf_len_; +}; + +template +class WarpMerge : public WarpSort { + public: + __device__ WarpMerge(idxT k, T dummy) : WarpSort(k, dummy) {} + + __device__ void add(const T* in, const idxT* in_idx, idxT start, idxT end) + { + idxT idx = start + lane_; + idxT first_end = (start + k_ < end) ? (start + k_) : end; + for (int i = 0; i < max_arr_len_; ++i, idx += WarpSize) { + if (idx < first_end) { + val_arr_[i] = in[idx]; + idx_arr_[i] = in_idx[idx]; + } + } + + for (start += k_; start < end; start += k_) { + load_sorted(in, in_idx, start); + } + } + + __device__ void done() {} + + private: + using WarpSort::max_arr_len_; + using WarpSort::val_arr_; + using WarpSort::idx_arr_; + using WarpSort::lane_; + using WarpSort::k_; + using WarpSort::dummy_; +}; + +template +int calc_smem_size_for_block_wide(int num_of_warp, idxT k) +{ + return Pow2<256>::roundUp(num_of_warp / 2 * sizeof(T) * k) + num_of_warp / 2 * sizeof(idxT) * k; +} + +template