NVIDIA · PointKernel · Sep 26, 2023 · Jul 3, 2023 · Aug 5, 2023 · Aug 5, 2023
@@ -35,6 +35,7 @@ endfunction(ConfigureExample)
 
 ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu")
 ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu")
+ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu")
 ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu")
 ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu")
 ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")

@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuco/static_set_ref.cuh>
+#include <cuco/storage.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/reduce.h>
+#include <thrust/scan.h>
+
+#include <cooperative_groups.h>
+
+#include <cuda/std/array>
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdio>
+#include <iostream>
+#include <vector>
+
+auto constexpr cg_size     = 8;   ///< Number of threads in the cooperative group handling a subset
+auto constexpr window_size = 1;   ///< Number of slots per window (TODO: explain further for users)
+auto constexpr N           = 10;  ///< Number of elements to insert and query
+
+/// Type of the keys stored in every subset
+using key_type = int;
+/// Linear probing performed by a cooperative group of `cg_size` threads
+using probing_scheme_type =
+  cuco::experimental::linear_probing<cg_size, cuco::default_hash_function<key_type>>;
+/// AoW storage type with `window_size` slots per window
+using storage_type     = cuco::experimental::aow_storage<key_type, window_size>;
+/// Reference type associated with `storage_type`
+using storage_ref_type = typename storage_type::ref_type;
+/// Set reference type built from the aliases above, parameterized on the operator tag it supports
+template <typename Operator>
+using ref_type = cuco::experimental::static_set_ref<key_type,
+                                                    cuda::thread_scope_device,
+                                                    thrust::equal_to<key_type>,
+                                                    probing_scheme_type,
+                                                    storage_ref_type,
+                                                    Operator>;
+
+/// Sample keys that are inserted into, and then queried from, every subset
+__device__ constexpr std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+/// Empty slots are represented by reserved "sentinel" values. These values should be selected such
+/// that they never occur in your input data.
+key_type constexpr empty_key_sentinel = -1;
+
+/**
+ * @brief Fills every slot of `n` consecutive windows with `value`.
+ *
+ * Each thread starts at its global thread index and advances by the total number of launched
+ * threads, so any 1D launch configuration covers all `n` windows. The index and the stride are
+ * computed in `std::size_t` so that neither wraps around in 32-bit arithmetic for large grids or
+ * for window counts beyond the 32-bit range.
+ *
+ * @tparam WindowT Window type; must be iterable slot by slot and expose `value_type`
+ *
+ * @param windows Pointer to the first window to initialize
+ * @param n Number of windows to initialize
+ * @param value Value written to every slot
+ */
+template <typename WindowT>
+__global__ void initialize(WindowT* windows, std::size_t n, typename WindowT::value_type value)
+{
+  using T = typename WindowT::value_type;
+
+  // Widen before multiplying: `gridDim.x * blockDim.x` would otherwise be evaluated as `unsigned`
+  auto const loop_stride = static_cast<std::size_t>(gridDim.x) * blockDim.x;
+  auto idx               = static_cast<std::size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
+
+  while (idx < n) {
+    auto& window_slots = windows[idx];
+#pragma unroll
+    for (auto& slot : window_slots) {
+      // Construct the slot in place inside the pre-allocated storage
+      new (&slot) T{value};
+    }
+    idx += loop_stride;
+  }
+}
+
+/**
+ * @brief Inserts every key in `data` into each subset, using one cooperative group per subset.
+ *
+ * The thread block is partitioned into tiles of `cg_size` threads. Tile `idx` builds an
+ * insert-only set reference over the `sizes[idx]` windows starting at `windows + offsets[idx]`
+ * and inserts all `N` keys into it.
+ *
+ * `idx` is not bounds-checked: the launch must provide exactly `cg_size` threads per subset.
+ *
+ * @param windows Pointer to the storage shared by all subsets
+ * @param sizes Number of windows of each subset
+ * @param offsets Offset, in windows, of each subset within `windows`
+ */
+template <typename Window, typename Size, typename Offset>
+__global__ void insert(Window* windows, Size* sizes, Offset* offsets)
+{
+  namespace cg = cooperative_groups;
+
+  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  // All threads of a tile share the same subset index
+  auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
+
+  // Use the same sentinel the storage was initialized with instead of a second magic `-1`
+  auto set_ref = ref_type<cuco::experimental::insert_tag>{
+    cuco::empty_key<key_type>{empty_key_sentinel},
+    {},
+    {},
+    storage_ref_type{sizes[idx], windows + offsets[idx]}};
+
+  // Each cooperative group inserts all elements in `data` into its own subset
+  for (int i = 0; i < N; ++i) {
+    set_ref.insert(tile, data[i]);
+  }
+}
+
+/**
+ * @brief Looks up every key in `data` in each subset, using one cooperative group per subset.
+ *
+ * The thread block is partitioned into tiles of `cg_size` threads. Tile `idx` builds a find-only
+ * set reference over the `sizes[idx]` windows starting at `windows + offsets[idx]` and queries
+ * all `N` keys. A block-wide flag records any failed lookup; thread 0 of the block prints a
+ * success message if the flag was never set.
+ *
+ * `idx` is not bounds-checked: the launch must provide exactly `cg_size` threads per subset.
+ *
+ * @param windows Pointer to the storage shared by all subsets
+ * @param sizes Number of windows of each subset
+ * @param offsets Offset, in windows, of each subset within `windows`
+ */
+template <typename Window, typename Size, typename Offset>
+__global__ void find(Window* windows, Size* sizes, Offset* offsets)
+{
+  namespace cg = cooperative_groups;
+
+  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  // All threads of a tile share the same subset index
+  auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
+
+  // Use the same sentinel the storage was initialized with instead of a second magic `-1`
+  auto set_ref = ref_type<cuco::experimental::find_tag>{
+    cuco::empty_key<key_type>{empty_key_sentinel},
+    {},
+    {},
+    storage_ref_type{sizes[idx], windows + offsets[idx]}};
+
+  // Block-wide failure flag: stays 0 as long as every lookup succeeds
+  __shared__ int result;
+  if (threadIdx.x == 0) { result = 0; }
+  __syncthreads();
+
+  for (int i = 0; i < N; ++i) {
+    auto const found = set_ref.find(tile, data[i]);
+    // A failed lookup yields `end()`, which must not be dereferenced; check it before comparing
+    // the stored key against the query key.
+    bool const missing = (found == set_ref.end()) or (*found != data[i]);
+    atomicOr(&result, static_cast<int>(missing));
+  }
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    // If the flag is still 0, all inserted data have been found
+    if (result == 0) { printf("Success! Found all inserted elements.\n"); }
+  }
+}
+
+/**
+ * @file device_subsets_example.cu
+ * @brief Demonstrates usage of the static_set device-side APIs.
+ *
+ * static_set provides a non-owning reference which can be used to interact with
+ * the container from within device code.
+ *
+ * This example packs several independent subsets into one storage allocation: the window count
+ * of every subset is computed on the host, an exclusive scan turns the counts into per-subset
+ * offsets, and the `insert`/`find` kernels build one set reference per subset on top of the
+ * shared storage.
+ *
+ * @return 0 on success, 1 if a kernel launch or its execution reported a CUDA error
+ */
+int main()
+{
+  // Number of subsets
+  auto constexpr num = 16;
+  // Sizes of the 16 subsets to be created on the device
+  auto constexpr subset_sizes =
+    std::array<std::size_t, num>{20, 20, 20, 20, 30, 30, 30, 30, 40, 40, 40, 40, 50, 50, 50, 50};
+
+  // Convert each requested size into a valid window extent for the given CG and window sizes
+  auto valid_sizes = std::vector<std::size_t>(num);
+  std::generate(valid_sizes.begin(), valid_sizes.end(), [&, n = 0]() mutable {
+    return cuco::experimental::make_window_extent<cg_size, window_size>(subset_sizes[n++]);
+  });
+
+  // The exclusive scan of the sizes gives the offset of each subset within the shared storage
+  auto const d_sizes = thrust::device_vector<std::size_t>{valid_sizes};
+  auto d_offsets     = thrust::device_vector<std::size_t>(num);
+  thrust::exclusive_scan(d_sizes.begin(), d_sizes.end(), d_offsets.begin());
+
+  auto const num_windows = thrust::reduce(valid_sizes.begin(), valid_sizes.end());
+
+  // One allocation for all subsets
+  auto d_set_storage = storage_type{num_windows};
+  // Initializes the storage with the given sentinel
+  d_set_storage.initialize(empty_key_sentinel);
+
+  // The kernels index subsets by cooperative group and do no bounds checking, so the launch must
+  // provide exactly one group of `cg_size` threads per subset.
+  auto constexpr threads_per_block = num * cg_size;
+
+  insert<<<1, threads_per_block>>>(
+    d_set_storage.data(), d_sizes.data().get(), d_offsets.data().get());
+  find<<<1, threads_per_block>>>(
+    d_set_storage.data(), d_sizes.data().get(), d_offsets.data().get());
+
+  // Kernel launches are asynchronous: wait for completion so that the device-side `printf` output
+  // is flushed and launch/execution errors are reported instead of being silently dropped.
+  auto status = cudaGetLastError();
+  if (status == cudaSuccess) { status = cudaDeviceSynchronize(); }
+  if (status != cudaSuccess) {
+    std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
@@ -16,10 +16,10 @@
 
 #pragma once
 
-#include <cuco/detail/storage/aow_storage_base.cuh>
-
 #include <cuco/cuda_stream_ref.hpp>
+#include <cuco/detail/storage/aow_storage_base.cuh>
 #include <cuco/extent.cuh>
+#include <cuco/utility/allocator.hpp>
 
 #include <cuda/std/array>
 
@@ -47,7 +47,10 @@ class aow_storage_ref;
  * @tparam Extent Type of extent denoting number of windows
  * @tparam Allocator Type of allocator used for device storage (de)allocation
  */
-template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
+template <typename T,
+          int32_t WindowSize,
+          typename Extent    = cuco::experimental::extent<std::size_t>,
+          typename Allocator = cuco::cuda_allocator<cuco::experimental::window<T, WindowSize>>>
 class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
  public:
   using base_type = detail::aow_storage_base<T, WindowSize, Extent>;  ///< AoW base class type
@@ -78,7 +81,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
    * @param size Number of windows to (de)allocate
    * @param allocator Allocator used for (de)allocating device storage
    */
-  explicit constexpr aow_storage(Extent size, Allocator const& allocator) noexcept;
+  explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept;
 
   aow_storage(aow_storage&&) = default;  ///< Move constructor
   /**
@@ -119,7 +122,15 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
    * @param key Key to which all keys in `slots` are initialized
    * @param stream Stream used for executing the kernel
    */
-  void initialize(value_type key, cuda_stream_ref stream) noexcept;
+  void initialize(value_type key, cuda_stream_ref stream = {}) noexcept;
+
+  /**
+   * @brief Asynchronously initializes each slot in the AoW storage to contain `key`.
+   *
+   * @param key Key to which all keys in `slots` are initialized
+   * @param stream Stream used for executing the kernel
+   */
+  void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept;
 
  private:
   allocator_type allocator_;            ///< Allocator used to (de)allocate windows
@@ -134,7 +145,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
  * @tparam WindowSize Number of slots in each window
  * @tparam Extent Type of extent denoting storage capacity
  */
-template <typename T, int32_t WindowSize, typename Extent>
+template <typename T, int32_t WindowSize, typename Extent = cuco::experimental::extent<std::size_t>>
 class aow_storage_ref : public detail::aow_storage_base<T, WindowSize, Extent> {
  public:
   using base_type = detail::aow_storage_base<T, WindowSize, Extent>;  ///< AoW base class type

@@ -141,11 +141,7 @@ class open_addressing_impl {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear(cuda_stream_ref stream) noexcept
-  {
-    this->clear_async(stream);
-    stream.synchronize();
-  }
+  void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); }
 
   /**
    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
@@ -155,7 +151,7 @@ class open_addressing_impl {
    */
   void clear_async(cuda_stream_ref stream) noexcept
   {
-    storage_.initialize(empty_slot_sentinel_, stream);
+    storage_.initialize_async(empty_slot_sentinel_, stream);
   }
 
   /**

@@ -19,6 +19,7 @@
 #include <cuco/detail/equal_wrapper.cuh>
 #include <cuco/extent.cuh>
 #include <cuco/pair.cuh>
+#include <cuco/probing_scheme.cuh>
 
 #include <thrust/distance.h>
 #include <thrust/pair.h>
@@ -63,12 +64,12 @@ class open_addressing_ref_impl {
                       ProbingScheme>,
     "ProbingScheme must inherit from cuco::detail::probing_scheme_base");
 
-  static_assert(is_window_extent_v<typename StorageRef::extent_type>,
-                "Extent is not a valid cuco::window_extent");
-  static_assert(ProbingScheme::cg_size == StorageRef::extent_type::cg_size,
-                "Extent has incompatible CG size");
-  static_assert(StorageRef::window_size == StorageRef::extent_type::window_size,
-                "Extent has incompatible window size");
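+  // NOTE(review): the extent checks below are commented out rather than removed — presumably so
+  // that storage refs built on a plain `cuco::experimental::extent` still compile. TODO: confirm,
+  // then either restore them or replace them with checks that cover both kinds of extent.
+  // static_assert(is_window_extent_v<typename StorageRef::extent_type>,
+  //               "Extent is not a valid cuco::window_extent");
+  // static_assert(ProbingScheme::cg_size == StorageRef::extent_type::cg_size,
+  //               "Extent has incompatible CG size");
+  // static_assert(StorageRef::window_size == StorageRef::extent_type::window_size,
+  //               "Extent has incompatible window size");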
 
  public:
   using key_type            = Key;                                     ///< Key type

@@ -66,6 +66,14 @@ aow_storage<T, WindowSize, Extent, Allocator>::ref() const noexcept
 template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
 void aow_storage<T, WindowSize, Extent, Allocator>::initialize(value_type key,
                                                                cuda_stream_ref stream) noexcept
+{
+  // Synchronous variant: delegate the actual work to `initialize_async` on `stream`...
+  this->initialize_async(key, stream);
+  // ...then synchronize `stream` before returning to the caller
+  stream.synchronize();
+}
+
+template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
+void aow_storage<T, WindowSize, Extent, Allocator>::initialize_async(
+  value_type key, cuda_stream_ref stream) noexcept
 {
   auto constexpr stride = 4;
   auto const grid_size  = (this->num_windows() + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) /

@@ -45,6 +45,7 @@ class storage : StorageImpl::template impl<T, Extent, Allocator> {
   using impl_type::capacity;
   using impl_type::data;
   using impl_type::initialize;
+  using impl_type::initialize_async;
   using impl_type::num_windows;
   using impl_type::ref;
 

@@ -18,8 +18,11 @@
 
 #include <cuco/detail/equal_wrapper.cuh>
 #include <cuco/detail/open_addressing_ref_impl.cuh>
+#include <cuco/hash_functions.cuh>
 #include <cuco/operator.hpp>
+#include <cuco/probing_scheme.cuh>
 #include <cuco/sentinel.cuh>
+#include <cuco/storage.cuh>
 
 #include <cuda/std/atomic>
 

@@ -20,6 +20,7 @@
 
 namespace cuco {
 namespace experimental {
+
 /**
  * @brief Public storage class.
  *
-Original file line number
+Diff line change
@@ Expand Up / @@ -20,6 +20,7 @@ @@
     namespace cuco {
     namespace experimental {
     /**
      * @brief Public storage class.
      *
@@ Expand Down @@