KNN select-top-k variants #551

Merged: 43 commits, Mar 30, 2022
Commits
179e1df
Integrate new select-top-k implementations
achirkin Mar 9, 2022
4749295
warpsort_topk: refactoring and fixing some bugs
achirkin Mar 9, 2022
8504d32
Allow passing indices along with keys (values).
achirkin Mar 9, 2022
cef3253
Adapt to the new bench
achirkin Mar 10, 2022
535fa0d
Use the pooled allocator helper
achirkin Mar 10, 2022
7d10507
Remove the step of calculating required buf size.
achirkin Mar 11, 2022
ba66efa
Remove unused code
achirkin Mar 11, 2022
3eab24b
Allow different types in select-k functions (float/double, int/size_t)
achirkin Mar 11, 2022
659bc18
More refactoring and comments
achirkin Mar 17, 2022
8b6351b
Update knn.cuh docs
achirkin Mar 17, 2022
a43e462
Add more comments
achirkin Mar 18, 2022
45f6a35
Use radix top-k as reference, because it supports larger k
achirkin Mar 18, 2022
0fe93d2
Add more comments and refactor vectorized_process
achirkin Mar 18, 2022
50800a4
Make bitonic sort use fewer template parameters for faster compile times
achirkin Mar 21, 2022
78805f0
Use gridDim.y for the batch dimension to simplify math and use less r…
achirkin Mar 21, 2022
9cf1f33
Update tests
achirkin Mar 21, 2022
90293dc
Allow larger batch sizes for radix_topk
achirkin Mar 21, 2022
b38c80e
Merge branch 'branch-22.04' into enh-knn-topk-variants
achirkin Mar 21, 2022
48ac5c7
Update docs
achirkin Mar 22, 2022
fa76a4d
More cosmetic refactoring
achirkin Mar 23, 2022
3285de5
Even more cosmetic refactoring
achirkin Mar 23, 2022
faecc32
Flip the ascending/descending flag for radix_topk
achirkin Mar 23, 2022
db24b10
Even more cosmetic refactoring
achirkin Mar 23, 2022
a30a2fc
Fix a typo
achirkin Mar 23, 2022
c722d9f
Rename one of the 'add' overloads to reflect it should be used only once
achirkin Mar 23, 2022
fe95ded
Refactor names and document radix_topk
achirkin Mar 24, 2022
00a62a4
Choose the batch size dynamically
achirkin Mar 24, 2022
52f863e
Rename the detail::topk folder
achirkin Mar 24, 2022
2a78c1f
Add the high-level algorithm description
achirkin Mar 24, 2022
d811f75
Rename the warpsort classes
achirkin Mar 24, 2022
fcab684
Fix a typo
achirkin Mar 24, 2022
dcb17fe
Merge remote-tracking branch 'rapidsai/branch-22.04' into enh-knn-top…
achirkin Mar 25, 2022
84de3f2
Clarify some parts of documentation for bitonic sort
achirkin Mar 25, 2022
25ff099
Update cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
achirkin Mar 28, 2022
99f6feb
Update cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
achirkin Mar 28, 2022
deb7e44
Update cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
achirkin Mar 28, 2022
073d0f5
Update cpp/include/raft/spatial/knn/detail/topk/warpsort_topk.cuh
achirkin Mar 28, 2022
a811740
Address review comments
achirkin Mar 28, 2022
ff2d6e6
Slightly reduce the number of tests for faster CI
achirkin Mar 28, 2022
e2f7d86
Couple more comments
achirkin Mar 28, 2022
1936abd
Address more comments
achirkin Mar 28, 2022
6b3804c
Remove commented-out bench cases
achirkin Mar 29, 2022
bea83b3
Change some bench cases
achirkin Mar 29, 2022
1 change: 1 addition & 0 deletions cpp/bench/CMakeLists.txt
@@ -19,6 +19,7 @@ set(RAFT_CPP_BENCH_TARGET "bench_raft")
# (please keep the filenames in alphabetical order)
add_executable(${RAFT_CPP_BENCH_TARGET}
bench/linalg/reduce.cu
bench/spatial/selection.cu
bench/main.cpp
)

119 changes: 119 additions & 0 deletions cpp/bench/spatial/selection.cu
@@ -0,0 +1,119 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <common/benchmark.hpp>
#include <raft/spatial/knn/knn.cuh>

#include <raft/random/rng.hpp>
#include <raft/sparse/detail/utils.h>

#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

namespace raft::bench::spatial {

struct params {
int n_inputs;
int input_len;
int k;
int select_min;
};

template <typename KeyT, typename IdxT, raft::spatial::knn::SelectKAlgo Algo>
struct selection : public fixture {
explicit selection(const params& p)
: params_(p),
in_dists_(p.n_inputs * p.input_len, stream),
in_ids_(p.n_inputs * p.input_len, stream),
out_dists_(p.n_inputs * p.k, stream),
out_ids_(p.n_inputs * p.k, stream)
{
raft::sparse::iota_fill(in_ids_.data(), IdxT(p.n_inputs), IdxT(p.input_len), stream);

[Review comment] Not something we need to do in this PR but it would be nice to move this utility out of sparse if it's going to get used in other places.

raft::random::Rng(42).uniform(
in_dists_.data(), in_dists_.size(), KeyT(-1.0), KeyT(1.0), stream);
}

void run_benchmark(::benchmark::State& state) override
{
using_pool_memory_res res;
try {
std::ostringstream label_stream;
label_stream << params_.n_inputs << "#" << params_.input_len << "#" << params_.k;
state.SetLabel(label_stream.str());
loop_on_state(state, [this]() {
raft::spatial::knn::select_k<IdxT, KeyT>(in_dists_.data(),
in_ids_.data(),
params_.n_inputs,
params_.input_len,
out_dists_.data(),
out_ids_.data(),
params_.select_min,
params_.k,
stream,
Algo);
});
} catch (raft::exception& e) {
state.SkipWithError(e.what());
}
}

private:
const params params_;
rmm::device_uvector<KeyT> in_dists_, out_dists_;
rmm::device_uvector<IdxT> in_ids_, out_ids_;
};

const std::vector<params> kInputs{
{20000, 500, 1, true}, {20000, 500, 2, true}, {20000, 500, 4, true},
{20000, 500, 8, true}, {20000, 500, 16, true}, {20000, 500, 32, true},
{20000, 500, 64, true}, {20000, 500, 128, true}, {20000, 500, 256, true},

{1000, 10000, 1, true}, {1000, 10000, 2, true}, {1000, 10000, 4, true},
{1000, 10000, 8, true}, {1000, 10000, 16, true}, {1000, 10000, 32, true},
{1000, 10000, 64, true}, {1000, 10000, 128, true}, {1000, 10000, 256, true},

{100, 100000, 1, true}, {100, 100000, 2, true}, {100, 100000, 4, true},
{100, 100000, 8, true}, {100, 100000, 16, true}, {100, 100000, 32, true},
{100, 100000, 64, true}, {100, 100000, 128, true}, {100, 100000, 256, true},

{10, 1000000, 1, true}, {10, 1000000, 2, true}, {10, 1000000, 4, true},
{10, 1000000, 8, true}, {10, 1000000, 16, true}, {10, 1000000, 32, true},
{10, 1000000, 64, true}, {10, 1000000, 128, true}, {10, 1000000, 256, true},
};

#define SELECTION_REGISTER(KeyT, IdxT, Algo) \
namespace BENCHMARK_PRIVATE_NAME(selection) \
{ \
using SelectK = selection<KeyT, IdxT, raft::spatial::knn::SelectKAlgo::Algo>; \
RAFT_BENCH_REGISTER(SelectK, #KeyT "/" #IdxT "/" #Algo, kInputs); \
}

SELECTION_REGISTER(float, int, FAISS);
SELECTION_REGISTER(float, int, RADIX_8_BITS);
SELECTION_REGISTER(float, int, RADIX_11_BITS);
SELECTION_REGISTER(float, int, WARP_SORT);

SELECTION_REGISTER(double, int, FAISS);
SELECTION_REGISTER(double, int, RADIX_8_BITS);
SELECTION_REGISTER(double, int, RADIX_11_BITS);
SELECTION_REGISTER(double, int, WARP_SORT);

SELECTION_REGISTER(double, size_t, FAISS);
SELECTION_REGISTER(double, size_t, RADIX_8_BITS);
SELECTION_REGISTER(double, size_t, RADIX_11_BITS);
SELECTION_REGISTER(double, size_t, WARP_SORT);

} // namespace raft::bench::spatial
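
For context, here is a minimal sketch (not part of the PR) of calling the benchmarked entry point directly. The wrapper name select_min_k and its argument layout are hypothetical; the select_k signature and the SelectKAlgo value mirror the benchmark code above.

#include <raft/spatial/knn/knn.cuh>

// Select the k smallest distances (and their ids) from each row of a
// row-major [n_rows, row_len] matrix; all pointers are device pointers.
void select_min_k(float* in_dists,
                  int* in_ids,
                  int n_rows,
                  int row_len,
                  float* out_dists,  // shape [n_rows, k]
                  int* out_ids,      // shape [n_rows, k]
                  int k,
                  cudaStream_t stream)
{
  // SelectKAlgo::WARP_SORT is one of the four algorithms benchmarked above;
  // FAISS, RADIX_8_BITS and RADIX_11_BITS are the others.
  raft::spatial::knn::select_k<int, float>(in_dists,
                                           in_ids,
                                           n_rows,
                                           row_len,
                                           out_dists,
                                           out_ids,
                                           /*select_min=*/true,
                                           k,
                                           stream,
                                           raft::spatial::knn::SelectKAlgo::WARP_SORT);
}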
16 changes: 16 additions & 0 deletions cpp/include/raft/cudart_utils.h
@@ -404,6 +404,22 @@ IntType gcd(IntType a, IntType b)
return a;
}

template <typename T>
constexpr T lower_bound()
{
if constexpr (std::numeric_limits<T>::has_infinity && std::numeric_limits<T>::is_signed) {
return -std::numeric_limits<T>::infinity();
}
return std::numeric_limits<T>::lowest();
}

template <typename T>
constexpr T upper_bound()
{
if constexpr (std::numeric_limits<T>::has_infinity) { return std::numeric_limits<T>::infinity(); }
return std::numeric_limits<T>::max();
}

} // namespace raft

#endif
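
A quick illustration (not from the PR) of the values these helpers produce, assuming raft/cudart_utils.h is included; below they replace the faiss::gpu::Limits<T>::getMax()/getMin() sentinels in selection_faiss.cuh, falling back to finite limits for types without an infinity.

#include <cstdint>
#include <limits>
#include <raft/cudart_utils.h>

// Floating-point types have an infinity, so the bounds are +/-infinity.
static_assert(raft::upper_bound<float>() == std::numeric_limits<float>::infinity());
static_assert(raft::lower_bound<float>() == -std::numeric_limits<float>::infinity());

// Integers have no infinity; the bounds fall back to max()/lowest().
static_assert(raft::upper_bound<int32_t>() == std::numeric_limits<int32_t>::max());
static_assert(raft::lower_bound<int32_t>() == std::numeric_limits<int32_t>::lowest());

// For unsigned types lowest() is zero.
static_assert(raft::lower_bound<uint32_t>() == 0u);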
78 changes: 43 additions & 35 deletions cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -31,33 +31,39 @@ namespace spatial {
namespace knn {
namespace detail {

-template <typename K, typename IndexType, bool select_min, int warp_q, int thread_q, int tpb>
-__global__ void select_k_kernel(K* inK,
-IndexType* inV,
+template <typename key_t, typename payload_t>
+constexpr int kFaissMaxK()
+{
+return (sizeof(key_t) + sizeof(payload_t) > 8) ? 512 : 1024;
+}
+
+template <typename key_t, typename payload_t, bool select_min, int warp_q, int thread_q, int tpb>
+__global__ void select_k_kernel(key_t* inK,
+payload_t* inV,
size_t n_rows,
size_t n_cols,
-K* outK,
-IndexType* outV,
-K initK,
-IndexType initV,
+key_t* outK,
+payload_t* outV,
+key_t initK,
+payload_t initV,
int k)
{
constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;

-__shared__ K smemK[kNumWarps * warp_q];
-__shared__ IndexType smemV[kNumWarps * warp_q];
+__shared__ key_t smemK[kNumWarps * warp_q];
+__shared__ payload_t smemV[kNumWarps * warp_q];

faiss::gpu::
-BlockSelect<K, IndexType, select_min, faiss::gpu::Comparator<K>, warp_q, thread_q, tpb>
+BlockSelect<key_t, payload_t, select_min, faiss::gpu::Comparator<key_t>, warp_q, thread_q, tpb>
heap(initK, initV, smemK, smemV, k);

// Grid is exactly sized to rows available
int row = blockIdx.x;
int i = threadIdx.x;

int idx = row * n_cols;
-K* inKStart = inK + idx + i;
-IndexType* inVStart = inV + idx + i;
+key_t* inKStart = inK + idx + i;
+payload_t* inVStart = inV + idx + i;

// Whole warps must participate in the selection
int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize);
@@ -84,13 +90,13 @@ __global__ void select_k_kernel(K* inK,
}
}

-template <typename value_idx = int, typename value_t = float, int warp_q, int thread_q>
-inline void select_k_impl(value_t* inK,
-value_idx* inV,
+template <typename payload_t = int, typename key_t = float, int warp_q, int thread_q>
+inline void select_k_impl(key_t* inK,
+payload_t* inV,
size_t n_rows,
size_t n_cols,
-value_t* outK,
-value_idx* outV,
+key_t* outK,
+payload_t* outV,
bool select_min,
int k,
cudaStream_t stream)
@@ -100,14 +106,13 @@ inline void select_k_impl(value_t* inK,
constexpr int n_threads = (warp_q <= 1024) ? 128 : 64;
auto block = dim3(n_threads);

-auto kInit =
-select_min ? faiss::gpu::Limits<value_t>::getMax() : faiss::gpu::Limits<value_t>::getMin();
+auto kInit = select_min ? upper_bound<key_t>() : lower_bound<key_t>();
auto vInit = -1;
if (select_min) {
-select_k_kernel<value_t, value_idx, false, warp_q, thread_q, n_threads>
+select_k_kernel<key_t, payload_t, false, warp_q, thread_q, n_threads>
<<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k);
} else {
-select_k_kernel<value_t, value_idx, true, warp_q, thread_q, n_threads>
+select_k_kernel<key_t, payload_t, true, warp_q, thread_q, n_threads>
<<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k);
}
RAFT_CUDA_TRY(cudaGetLastError());
@@ -127,38 +132,41 @@ inline void select_k_impl(value_t* inK,
* @param[in] k number of neighbors per partition (also number of merged neighbors)
* @param[in] stream CUDA stream to use
*/
-template <typename value_idx = int, typename value_t = float>
-inline void select_k(value_t* inK,
-value_idx* inV,
+template <typename payload_t = int, typename key_t = float>
+inline void select_k(key_t* inK,
+payload_t* inV,
size_t n_rows,
size_t n_cols,
-value_t* outK,
-value_idx* outV,
+key_t* outK,
+payload_t* outV,
bool select_min,
int k,
cudaStream_t stream)
{
+constexpr int max_k = kFaissMaxK<payload_t, key_t>();
if (k == 1)
-select_k_impl<value_idx, value_t, 1, 1>(
+select_k_impl<payload_t, key_t, 1, 1>(
inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
else if (k <= 32)
-select_k_impl<value_idx, value_t, 32, 2>(
+select_k_impl<payload_t, key_t, 32, 2>(
inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
else if (k <= 64)
-select_k_impl<value_idx, value_t, 64, 3>(
+select_k_impl<payload_t, key_t, 64, 3>(
inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
else if (k <= 128)
-select_k_impl<value_idx, value_t, 128, 3>(
+select_k_impl<payload_t, key_t, 128, 3>(
inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
else if (k <= 256)
-select_k_impl<value_idx, value_t, 256, 4>(
+select_k_impl<payload_t, key_t, 256, 4>(
inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
else if (k <= 512)
-select_k_impl<value_idx, value_t, 512, 8>(
+select_k_impl<payload_t, key_t, 512, 8>(
inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
-else if (k <= 1024)
-select_k_impl<value_idx, value_t, 1024, 8>(
+else if (k <= 1024 && k <= max_k)
+select_k_impl<payload_t, key_t, max_k, 8>(
inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
+else
+ASSERT(k <= max_k, "Current max k is %d (requested %d)", max_k, k);
}

}; // namespace detail
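
To make the new cap concrete, here is a small check (not in the PR) of what kFaissMaxK evaluates to for the type pairs registered in the benchmark; the values follow directly from the sizeof rule above. Note that the call site instantiates kFaissMaxK<payload_t, key_t> while the definition names its parameters <key_t, payload_t>; the swap is harmless because the rule only sums the two sizes.

#include <cstddef>

namespace d = raft::spatial::knn::detail;

static_assert(d::kFaissMaxK<float, int>() == 1024);          // 4 + 4 bytes <= 8
static_assert(d::kFaissMaxK<double, int>() == 512);          // 8 + 4 bytes > 8
static_assert(d::kFaissMaxK<double, std::size_t>() == 512);  // 8 + 8 bytes > 8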