Migrate set retrieve to use the OA implementation (#637)

This PR updates the legacy set retrieve to use the new open-addressing solution. It enhances open-addressing retrieve by eliminating the use of coalesced groups to reduce register pressure, resulting in approximately 10% to 40% speedups in multiset retrieve benchmarks.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Daniel Jünger <[email protected]>
NVIDIA · Nov 21, 2024 · d829576 · d829576
1 parent 644e553
commit d829576
Show file tree

Hide file tree

Showing 8 changed files with 432 additions and 571 deletions.
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -52,6 +52,7 @@ ConfigureBench(STATIC_SET_BENCH
   static_set/contains_bench.cu
   static_set/find_bench.cu
   static_set/insert_bench.cu
+  static_set/retrieve_bench.cu
   static_set/retrieve_all_bench.cu
   static_set/size_bench.cu
   static_set/rehash_bench.cu)

diff --git a/benchmarks/static_set/retrieve_bench.cu b/benchmarks/static_set/retrieve_bench.cu
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark_defaults.hpp>
+#include <benchmark_utils.hpp>
+
+#include <cuco/static_set.cuh>
+#include <cuco/utility/key_generator.cuh>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/transform.h>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::static_set::retrieve` performance
+ *
+ * Reads the `NumInputs`, `Occupancy` and `MatchingRate` axes from `state`, builds a set from the
+ * generated keys, and then times `retrieve` over the probe keys. The probe keys are the inserted
+ * keys after `key_generator::dropout` has been applied with `MatchingRate`.
+ *
+ * @tparam Key Key type of the set
+ * @tparam Dist Key distribution tag forwarded to `dist_from_state`
+ *
+ * @param state nvbench state providing the axis values and the timed launch
+ */
+template <typename Key, typename Dist>
+void static_set_retrieve(nvbench::state& state, nvbench::type_list<Key, Dist>)
+{
+  auto const num_keys      = state.get_int64("NumInputs");
+  auto const occupancy     = state.get_float64("Occupancy");
+  auto const matching_rate = state.get_float64("MatchingRate");
+
+  // Capacity chosen so that `num_keys` insertions reach the requested occupancy
+  std::size_t const size = num_keys / occupancy;
+
+  thrust::device_vector<Key> keys(num_keys);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  cuco::static_set<Key> set{size, cuco::empty_key<Key>{-1}};
+  set.insert(keys.begin(), keys.end());
+
+  // Apply the dropout only after the set has been built. If the keys were modified before the
+  // insertion, every probe key would be present in the set and `MatchingRate` would have no
+  // effect on the measurement.
+  // NOTE(review): assumes `dropout` replaces a fraction of the keys in place - confirm.
+  gen.dropout(keys.begin(), keys.end(), matching_rate);
+
+  state.add_element_count(num_keys);
+
+  // Size the match buffer from the number of probe keys that actually hit
+  auto const output_size = set.count(keys.begin(), keys.end());
+  thrust::device_vector<Key> output_match(output_size);
+  // The probe-side output is not needed by the benchmark, so it is discarded
+  auto output_probe_begin = thrust::discard_iterator{};
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    set.retrieve(
+      keys.begin(), keys.end(), output_probe_begin, output_match.begin(), {launch.get_stream()});
+  });
+}
+
+NVBENCH_BENCH_TYPES(static_set_retrieve,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_set_retrieve_uniform_occupancy")  // variant varying the "Occupancy" axis
+  .set_type_axes_names({"Key", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("NumInputs", {defaults::N})
+  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE)  // presumably a list of values; the other value axes are single-element lists
+  .add_float64_axis("MatchingRate", {defaults::MATCHING_RATE})
+  .add_int64_axis("Multiplicity", {defaults::MULTIPLICITY});  // not read directly by static_set_retrieve; presumably consumed via dist_from_state - TODO confirm
+
+NVBENCH_BENCH_TYPES(static_set_retrieve,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_set_retrieve_uniform_matching_rate")  // variant varying the "MatchingRate" axis
+  .set_type_axes_names({"Key", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("NumInputs", {defaults::N})
+  .add_float64_axis("Occupancy", {defaults::OCCUPANCY})
+  .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE)  // presumably a list of values; the other value axes are single-element lists
+  .add_int64_axis("Multiplicity", {defaults::MULTIPLICITY});  // not read directly by static_set_retrieve; presumably consumed via dist_from_state - TODO confirm
+
+NVBENCH_BENCH_TYPES(static_set_retrieve,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_set_retrieve_uniform_multiplicity")  // variant varying the "Multiplicity" axis
+  .set_type_axes_names({"Key", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("NumInputs", {defaults::N})
+  .add_float64_axis("Occupancy", {defaults::OCCUPANCY})
+  .add_float64_axis("MatchingRate", {defaults::MATCHING_RATE})
+  .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);  // presumably a list of values; not read directly by static_set_retrieve, presumably consumed via dist_from_state - TODO confirm
diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh
@@ -415,77 +415,6 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
   }
 }
 
-/**
- * @brief Retrieves the equivalent container elements of all keys in the range `[input_probe,
- * input_probe + n)`.
- *
- * If key `k = *(input_probe + i)` has one or more matches in the container, copies `k` to
- * `output_probe` and associated slot contents to `output_match`, respectively. The output order is
- * unspecified.
- *
- * @tparam IsOuter Flag indicating whether it's an outer count or not
- * @tparam block_size The size of the thread block
- * @tparam InputProbeIt Device accessible input iterator
- * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is
- * convertible to the `InputProbeIt`'s `value_type`
- * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is
- * convertible to the container's `value_type`
- * @tparam AtomicCounter Integral atomic type that follows the same semantics as
- * `cuda::(std::)atomic(_ref)`
- * @tparam Ref Type of non-owning device ref allowing access to storage
- *
- * @param input_probe Beginning of the sequence of input keys
- * @param n Number of the keys to query
- * @param output_probe Beginning of the sequence of keys corresponding to matching elements in
- * `output_match`
- * @param output_match Beginning of the sequence of matching elements
- * @param atomic_counter Pointer to an atomic object of integral type that is used to count the
- * number of output elements
- * @param ref Non-owning container device ref used to access the slot storage
- */
-template <bool IsOuter,
-          int32_t BlockSize,
-          class InputProbeIt,
-          class OutputProbeIt,
-          class OutputMatchIt,
-          class AtomicCounter,
-          class Ref>
-CUCO_KERNEL __launch_bounds__(BlockSize) void retrieve(InputProbeIt input_probe,
-                                                       cuco::detail::index_type n,
-                                                       OutputProbeIt output_probe,
-                                                       OutputMatchIt output_match,
-                                                       AtomicCounter* atomic_counter,
-                                                       Ref ref)
-{
-  namespace cg = cooperative_groups;
-
-  auto const block              = cg::this_thread_block();
-  auto constexpr tiles_in_block = BlockSize / Ref::cg_size;
-  // make sure all but the last block are always occupied
-  auto const items_per_block = detail::int_div_ceil(n, tiles_in_block * gridDim.x) * tiles_in_block;
-
-  auto const block_begin_offset = block.group_index().x * items_per_block;
-  auto const block_end_offset   = min(n, block_begin_offset + items_per_block);
-
-  if (block_begin_offset < block_end_offset) {
-    if constexpr (IsOuter) {
-      ref.retrieve_outer<BlockSize>(block,
-                                    input_probe + block_begin_offset,
-                                    input_probe + block_end_offset,
-                                    output_probe,
-                                    output_match,
-                                    *atomic_counter);
-    } else {
-      ref.retrieve<BlockSize>(block,
-                              input_probe + block_begin_offset,
-                              input_probe + block_end_offset,
-                              output_probe,
-                              output_match,
-                              *atomic_counter);
-    }
-  }
-}
-
 /**
  * @brief Inserts all elements in the range `[first, last)`.
  *
@@ -642,6 +571,76 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
   if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); }
 }
 
+/**
+ * @brief Retrieves the equivalent container elements of all keys in the range `[input_probe,
+ * input_probe + n)`.
+ *
+ * If key `k = *(input_probe + i)` has one or more matches in the container, copies `k` to
+ * `output_probe` and associated slot contents to `output_match`, respectively. The output order is
+ * unspecified.
+ *
+ * @note Each thread block handles the `BlockSize / Ref::cg_size` consecutive keys starting at
+ * `blockIdx.x * (BlockSize / Ref::cg_size)`; blocks whose begin offset is not smaller than `n` do
+ * nothing. The grid must therefore provide enough blocks to cover all `n` keys.
+ *
+ * @tparam IsOuter Flag indicating whether it's an outer retrieve or not
+ * @tparam BlockSize The size of the thread block
+ * @tparam InputProbeIt Device accessible input iterator
+ * @tparam OutputProbeIt Device accessible output iterator to which values of the `InputProbeIt`'s
+ * `value_type` can be written
+ * @tparam OutputMatchIt Device accessible output iterator to which values of the container's
+ * `value_type` can be written
+ * @tparam AtomicCounter Integral atomic type that follows the same semantics as
+ * `cuda::(std::)atomic(_ref)`
+ * @tparam Ref Type of non-owning device ref allowing access to storage
+ *
+ * @param input_probe Beginning of the sequence of input keys
+ * @param n Number of the keys to query
+ * @param output_probe Beginning of the sequence of keys corresponding to matching elements in
+ * `output_match`
+ * @param output_match Beginning of the sequence of matching elements
+ * @param atomic_counter Pointer to an atomic object of integral type that is used to count the
+ * number of output elements
+ * @param ref Non-owning container device ref used to access the slot storage
+ */
+template <bool IsOuter,
+          int32_t BlockSize,
+          class InputProbeIt,
+          class OutputProbeIt,
+          class OutputMatchIt,
+          class AtomicCounter,
+          class Ref>
+CUCO_KERNEL void retrieve(InputProbeIt input_probe,
+                          cuco::detail::index_type n,
+                          OutputProbeIt output_probe,
+                          OutputMatchIt output_match,
+                          AtomicCounter* atomic_counter,
+                          Ref ref)
+{
+  namespace cg = cooperative_groups;
+
+  auto const block              = cg::this_thread_block();
+  auto constexpr tiles_in_block = BlockSize / Ref::cg_size;
+
+  // Widen the block index BEFORE multiplying: `group_index().x` is a 32-bit unsigned value, so
+  // the product (and the following addition) would otherwise wrap around for large inputs.
+  auto const block_begin_offset =
+    static_cast<cuco::detail::index_type>(block.group_index().x) * tiles_in_block;
+  auto const block_end_offset =
+    min(n, static_cast<cuco::detail::index_type>(block_begin_offset + tiles_in_block));
+
+  // Blocks that start at or past the end of the input have nothing to do
+  if (block_begin_offset < block_end_offset) {
+    if constexpr (IsOuter) {
+      ref.retrieve_outer<BlockSize>(block,
+                                    input_probe + block_begin_offset,
+                                    input_probe + block_end_offset,
+                                    output_probe,
+                                    output_match,
+                                    atomic_counter);
+    } else {
+      ref.retrieve<BlockSize>(block,
+                              input_probe + block_begin_offset,
+                              input_probe + block_end_offset,
+                              output_probe,
+                              output_match,
+                              atomic_counter);
+    }
+  }
+}
+
 /**
  * @brief Calculates the number of filled slots for the given bucket storage.
  *