NVIDIA · sleeepyjack · Oct 11, 2023 · Oct 6, 2023 · Oct 6, 2023 · Oct 6, 2023
@@ -53,7 +53,8 @@ ConfigureBench(STATIC_SET_BENCH
   hash_table/static_set/find_bench.cu
   hash_table/static_set/insert_bench.cu
   hash_table/static_set/retrieve_all_bench.cu
-  hash_table/static_set/size_bench.cu)
+  hash_table/static_set/size_bench.cu
+  hash_table/static_set/rehash_bench.cu)
 
 ###################################################################################################
 # - static_map benchmarks -------------------------------------------------------------------------

@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <cuco/static_set.cuh>
+#include <cuco/utility/key_generator.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/device_vector.h>
+
+/**
+ * @brief A benchmark evaluating `cuco::static_set::rehash` performance
+ */
+template <typename Key, typename Dist>
+void static_set_rehash(nvbench::state& state, nvbench::type_list<Key, Dist>)
+{
+  std::size_t const capacity = state.get_int64_or_default("Capacity", cuco::benchmark::defaults::N);
+  auto const occupancy =
+    state.get_float64_or_default("Occupancy", cuco::benchmark::defaults::OCCUPANCY);
+
+  std::size_t const num_keys = capacity * occupancy;
+
+  thrust::device_vector<Key> keys(num_keys);  // slots per second
+
+  cuco::utility::key_generator gen;
+  gen.generate(cuco::benchmark::dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  state.add_element_count(capacity);
+
+  cuco::experimental::static_set<Key> set{capacity, cuco::empty_key<Key>{-1}};
+
+  set.insert(keys.begin(), keys.end());
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { set.rehash({launch.get_stream()}); });
+}
+
+NVBENCH_BENCH_TYPES(static_set_rehash,
+                    NVBENCH_TYPE_AXES(cuco::benchmark::defaults::KEY_TYPE_RANGE,
+                                      nvbench::type_list<cuco::utility::distribution::unique>))
+  .set_name("static_set_rehash_unique_occupancy")
+  .set_type_axes_names({"Key", "Distribution"})
+  .set_max_noise(cuco::benchmark::defaults::MAX_NOISE)
+  .add_float64_axis("Occupancy", cuco::benchmark::defaults::OCCUPANCY_RANGE);
@@ -293,6 +293,53 @@ __global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count)
   if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); }
 }
 
+template <int32_t BlockSize, typename ContainerRef, typename Predicate>
+__global__ void rehash(typename ContainerRef::storage_ref_type storage_ref,
+                       ContainerRef container_ref,
+                       Predicate is_filled)
+{
+  namespace cg = cooperative_groups;
+
+  __shared__ typename ContainerRef::value_type buffer[BlockSize];
+  __shared__ unsigned int buffer_size;
+
+  auto constexpr cg_size = ContainerRef::cg_size;
+  auto const block       = cg::this_thread_block();
+  auto const tile        = cg::tiled_partition<cg_size>(block);
+
+  auto const thread_rank         = block.thread_rank();
+  auto constexpr tiles_per_block = BlockSize / cg_size;  // tile.meta_group_size() but constexpr
+  auto const tile_rank           = tile.meta_group_rank();
+  auto const loop_stride         = cuco::detail::grid_stride();
+  auto idx                       = cuco::detail::global_thread_id();
+  auto const n                   = storage_ref.num_windows();
+
+  while (idx - thread_rank < n) {
+    if (thread_rank == 0) { buffer_size = 0; }
+    block.sync();
+
+    // gather values in shmem buffer
+    if (idx < n) {
+      auto const window = storage_ref[idx];
+
+      for (auto const& slot : window) {
+        if (is_filled(slot)) { buffer[atomicAdd_block(&buffer_size, 1)] = slot; }
+      }
+    }
+    block.sync();
+
+    auto const local_buffer_size = buffer_size;
+
+    // insert from shmem buffer into the container
+    for (auto tidx = tile_rank; tidx < local_buffer_size; tidx += tiles_per_block) {
+      container_ref.insert(tile, buffer[tidx]);
+    }
+    block.sync();
+
+    idx += loop_stride;
+  }
+}
+
 }  // namespace detail
 }  // namespace experimental
 }  // namespace cuco
@@ -22,6 +22,7 @@
 #include <cuco/detail/storage/counter_storage.cuh>
 #include <cuco/detail/utility/cuda.hpp>
 #include <cuco/extent.cuh>
+#include <cuco/operator.hpp>
 #include <cuco/probing_scheme.cuh>
 #include <cuco/storage.cuh>
 #include <cuco/utility/traits.hpp>
@@ -624,6 +625,120 @@ class open_addressing_impl {
     return counter.load_to_host(stream);
   }
 
+  /**
+   * @brief Regenerates the container
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `rehash_async`.
+   *
+   * @tparam Container The container type this function operates on
+   * @tparam Predicate Type of predicate indicating if the given slot is filled
+   *
+   * @param extent The container's new `window_extent` after this operation took place
+   * @param container The container to be rehashed
+   * @param is_filled Predicate indicating if the given slot is filled
+   * @param stream CUDA stream used for this operation
+   */
+  template <typename Container, typename Predicate>
+  void rehash(Container const& container, Predicate const& is_filled, cuda_stream_ref stream)
+  {
+    this->rehash_async(container, is_filled, stream);
+    stream.synchronize();
+  }
+
+  /**
+   * @brief Asynchronously reserves at least the specified number of slots and regenerates the
+   * container
+   *
+   * @note Changes the number of windows to a value that is not less than `extent`, then
+   * rehashes the container, i.e. puts the elements into appropriate slots considering
+   * that the total number of slots has changed.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `rehash_async`.
+   *
+   * @note Behavior is undefined if the desired `extent` is insufficient to store all of the
+   * contained elements.
+   *
+   * @note This function is not available if the conatiner's `extent_type` is static.
+   *
+   * @tparam Container The container type this function operates on
+   * @tparam Predicate Type of predicate indicating if the given slot is filled
+   *
+   * @param extent The container's new `window_extent` after this operation took place
+   * @param container The container to be rehashed
+   * @param is_filled Predicate indicating if the given slot is filled
+   * @param stream CUDA stream used for this operation
+   */
+  template <typename Container, typename Predicate>
+  void rehash(extent_type extent,
+              Container const& container,
+              Predicate const& is_filled,
+              cuda_stream_ref stream)
+  {
+    this->rehash_async(extent, container, is_filled, stream);
+    stream.synchronize();
+  }
+
+  /**
+   * @brief Asynchronously regenerates the container
+   *
+   * @tparam Container The container type this function operates on
+   * @tparam Predicate Type of predicate indicating if the given slot is filled
+   *
+   * @param extent The container's new `window_extent` after this operation took place
+   * @param container The container to be rehashed
+   * @param is_filled Predicate indicating if the given slot is filled
+   * @param stream CUDA stream used for this operation
+   */
+  template <typename Container, typename Predicate>
+  void rehash_async(Container const& container, Predicate const& is_filled, cuda_stream_ref stream)
+  {
+    this->rehash_async(this->storage_.window_extent(), container, is_filled, stream);
+  }
+
+  /**
+   * @brief Asynchronously reserves at least the specified number of slots and regenerates the
+   * container
+   *
+   * @note Changes the number of windows to a value that is not less than `extent`, then
+   * rehashes the container, i.e. puts the elements into appropriate slots considering
+   * that the total number of slots has changed.
+   *
+   * @note Behavior is undefined if the desired `extent` is insufficient to store all of the
+   * contained elements.
+   *
+   * @note This function is not available if the conatiner's `extent_type` is static.
+   *
+   * @tparam Container The container type this function operates on
+   * @tparam Predicate Type of predicate indicating if the given slot is filled
+   *
+   * @param extent The container's new `window_extent` after this operation took place
+   * @param container The container to be rehashed
+   * @param is_filled Predicate indicating if the given slot is filled
+   * @param stream CUDA stream used for this operation
+   */
+  template <typename Container, typename Predicate>
+  void rehash_async(extent_type extent,
+                    Container const& container,
+                    Predicate const& is_filled,
+                    cuda_stream_ref stream)
+  {
+    auto const old_storage = std::move(this->storage_);
+    new (&storage_) storage_type{extent, this->allocator()};
+    this->clear_async(stream);
+
+    auto const num_windows = old_storage.num_windows();
+    if (num_windows == 0) { return; }
+
+    auto constexpr block_size = cuco::detail::default_block_size();
+    auto constexpr stride     = cuco::detail::default_stride();
+    auto const grid_size      = cuco::detail::grid_size(num_windows, 1, stride, block_size);
+
+    detail::rehash<block_size><<<grid_size, block_size, 0, stream>>>(
+      old_storage.ref(), container.ref(op::insert), is_filled);
+  }
+
   /**
    * @brief Gets the maximum number of elements the container can hold.
    *

@@ -407,6 +407,72 @@ static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
   return std::make_pair(keys_out + num, values_out + num);
 }
 
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
+  cuda_stream_ref stream)
+{
+  auto const is_filled = static_map_ns::detail::slot_is_filled<Key, T>(this->empty_key_sentinel(),
+                                                                       this->erased_key_sentinel());
+  this->impl_->rehash(*this, is_filled, stream);
+}
+
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
+  size_type capacity, cuda_stream_ref stream)
+{
+  auto const is_filled = static_map_ns::detail::slot_is_filled<Key, T>(this->empty_key_sentinel(),
+                                                                       this->erased_key_sentinel());
+  auto const extent    = make_window_extent<static_map>(capacity);
+  this->impl_->rehash(extent, *this, is_filled, stream);
+}
+
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
+  cuda_stream_ref stream)
+{
+  auto const is_filled = static_map_ns::detail::slot_is_filled<Key, T>(this->empty_key_sentinel(),
+                                                                       this->erased_key_sentinel());
+  this->impl_->rehash_async(*this, is_filled, stream);
+}
+
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
+  size_type capacity, cuda_stream_ref stream)
+{
+  auto const is_filled = static_map_ns::detail::slot_is_filled<Key, T>(this->empty_key_sentinel(),
+                                                                       this->erased_key_sentinel());
+  auto const extent    = make_window_extent<static_map>(capacity);
+  this->impl_->rehash_async(extent, *this, is_filled, stream);
+}
+
 template <class Key,
           class T,
           class Extent,

@@ -335,6 +335,68 @@ OutputIt static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stor
   return impl_->retrieve_all(begin, output_begin, is_filled, stream);
 }
 
+template <class Key,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
+  cuda_stream_ref stream)
+{
+  auto const is_filled =
+    static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel());
+  this->impl_->rehash(*this, is_filled, stream);
+}
+
+template <class Key,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
+  size_type capacity, cuda_stream_ref stream)
+{
+  auto const is_filled =
+    static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel());
+  auto const extent = make_window_extent<static_set>(capacity);
+  this->impl_->rehash(extent, *this, is_filled, stream);
+}
+
+template <class Key,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
+  cuda_stream_ref stream)
+{
+  auto const is_filled =
+    static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel());
+  this->impl_->rehash_async(*this, is_filled, stream);
+}
+
+template <class Key,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
+  size_type capacity, cuda_stream_ref stream)
+{
+  auto const is_filled =
+    static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel());
+  auto const extent = make_window_extent<static_set>(capacity);
+  this->impl_->rehash_async(extent, *this, is_filled, stream);
+}
+
 template <class Key,
           class Extent,
           cuda::thread_scope Scope,