Add data structure base classes and cuco::static_set (#278)

This is the first PR related to #110. It introduces the concept of: - New probing scheme via probing iterator - Array of Windows storage instead of flat storage to better deal with memory bandwidth-bound workload when hash collisions are present - Dynamic and static extent type for efficient probing - Mixin to encode concurrent device operators - Synchronous and asynchronous host bulk APIs This PR also adds `cuco::static_set` to evaluate the new design. For now, only 2 basic operations, `insert` and `contains`, are supported. --------- Co-authored-by: Daniel Juenger <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
NVIDIA · Apr 6, 2023 · ed620ab · ed620ab
1 parent 9287726
commit ed620ab
Show file tree

Hide file tree

Showing 39 changed files with 3,768 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -181,6 +181,14 @@ class example_class {
 
 We plan to add many GPU-accelerated, concurrent data structures to `cuCollections`. As of now, the two flagships are variants of hash tables. 
 
+### `static_set`
+
+`cuco::static_set` is a fixed-size container that stores unique elements in no particular order. See the Doxygen documentation in `static_set.cuh` for more detailed information.
+
+#### Examples:
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/jnjcdG16c))
+- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/EGMj6qx73))
+
 ### `static_map`
 
 `cuco::static_map` is a fixed-size hash table using open addressing with linear probing. See the Doxygen documentation in `static_map.cuh` for more detailed information.

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
@@ -46,6 +46,13 @@ endfunction(ConfigureBench)
 ### benchmark sources #############################################################################
 ###################################################################################################
 
+###################################################################################################
+# - static_set benchmarks -------------------------------------------------------------------------
+ConfigureBench(STATIC_SET_BENCH
+  hash_table/static_set/contains_bench.cu
+  hash_table/static_set/insert_bench.cu
+  hash_table/static_set/size_bench.cu)
+
 ###################################################################################################
 # - static_map benchmarks -------------------------------------------------------------------------
 ConfigureBench(STATIC_MAP_BENCH

diff --git a/benchmarks/hash_table/static_set/contains_bench.cu b/benchmarks/hash_table/static_set/contains_bench.cu
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <cuco/static_set.cuh>
+#include <cuco/utility/key_generator.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/device_vector.h>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::static_set::contains` performance
+ */
+template <typename Key, typename Dist>
+void static_set_contains(nvbench::state& state, nvbench::type_list<Key, Dist>)
+{
+  auto const num_keys      = state.get_int64_or_default("NumInputs", defaults::N);
+  auto const occupancy     = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);
+  auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE);
+
+  std::size_t const size = num_keys / occupancy;
+
+  thrust::device_vector<Key> keys(num_keys);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  cuco::experimental::static_set<Key> set{size, cuco::empty_key<Key>{-1}};
+  set.insert(keys.begin(), keys.end());
+
+  gen.dropout(keys.begin(), keys.end(), matching_rate);
+
+  thrust::device_vector<bool> result(num_keys);
+
+  state.add_element_count(num_keys);
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    set.contains(keys.begin(), keys.end(), result.begin(), launch.get_stream());
+  });
+}
+
+NVBENCH_BENCH_TYPES(static_set_contains,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      nvbench::type_list<distribution::unique>))
+  .set_name("static_set_contains_unique_occupancy")
+  .set_type_axes_names({"Key", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
+
+NVBENCH_BENCH_TYPES(static_set_contains,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      nvbench::type_list<distribution::unique>))
+  .set_name("static_set_contains_unique_matching_rate")
+  .set_type_axes_names({"Key", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE);
diff --git a/benchmarks/hash_table/static_set/insert_bench.cu b/benchmarks/hash_table/static_set/insert_bench.cu
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <cuco/static_set.cuh>
+#include <cuco/utility/key_generator.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/device_vector.h>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::static_set::insert` performance
+ */
+template <typename Key, typename Dist>
+void static_set_insert(nvbench::state& state, nvbench::type_list<Key, Dist>)
+{
+  auto const num_keys  = state.get_int64_or_default("NumInputs", defaults::N);
+  auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);
+
+  std::size_t const size = num_keys / occupancy;
+
+  thrust::device_vector<Key> keys(num_keys);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  state.add_element_count(num_keys);
+
+  state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
+             [&](nvbench::launch& launch, auto& timer) {
+               cuco::experimental::static_set<Key> set{
+                 size, cuco::empty_key<Key>{-1}, {}, {}, {}, launch.get_stream()};
+
+               timer.start();
+               set.insert(keys.begin(), keys.end(), launch.get_stream());
+               timer.stop();
+             });
+}
+
+NVBENCH_BENCH_TYPES(static_set_insert,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_set_insert_uniform_multiplicity")
+  .set_type_axes_names({"Key", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);
+
+NVBENCH_BENCH_TYPES(static_set_insert,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      nvbench::type_list<distribution::unique>))
+  .set_name("static_set_insert_unique_occupancy")
+  .set_type_axes_names({"Key", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
+
+NVBENCH_BENCH_TYPES(static_set_insert,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      nvbench::type_list<distribution::gaussian>))
+  .set_name("static_set_insert_gaussian_skew")
+  .set_type_axes_names({"Key", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_float64_axis("Skew", defaults::SKEW_RANGE);
diff --git a/benchmarks/hash_table/static_set/size_bench.cu b/benchmarks/hash_table/static_set/size_bench.cu
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <cuco/static_set.cuh>
+#include <cuco/utility/key_generator.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/device_vector.h>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::static_set::size` performance
+ */
+template <typename Key, typename Dist>
+void static_set_size(nvbench::state& state, nvbench::type_list<Key, Dist>)
+{
+  auto const num_keys  = state.get_int64_or_default("NumInputs", defaults::N);
+  auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);
+
+  std::size_t const size = num_keys / occupancy;
+
+  thrust::device_vector<Key> keys(num_keys);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  state.add_element_count(num_keys);
+
+  cuco::experimental::static_set<Key> set{size, cuco::empty_key<Key>{-1}};
+
+  set.insert(keys.begin(), keys.end());
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { auto const size = set.size(launch.get_stream()); });
+}
+
+NVBENCH_BENCH_TYPES(static_set_size,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      nvbench::type_list<distribution::unique>))
+  .set_name("static_set_size_unique_occupancy")
+  .set_type_axes_names({"Key", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -33,6 +33,8 @@ endfunction(ConfigureExample)
 ### Example sources ###############################################################################
 ###################################################################################################
 
+ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu")
+ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu")
 ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu")
 ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu")
 ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")

diff --git a/examples/static_set/device_ref_example.cu b/examples/static_set/device_ref_example.cu
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuco/static_set.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/logical.h>
+#include <thrust/sequence.h>
+
+#include <cooperative_groups.h>
+
+#include <cstddef>
+#include <iostream>
+
+// insert a set of keys into a hash set using one cooperative group for each task
+template <typename SetRef, typename InputIterator>
+__global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n)
+{
+  namespace cg = cooperative_groups;
+
+  constexpr auto cg_size = SetRef::cg_size;
+
+  auto tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+
+  int64_t const loop_stride = gridDim.x * blockDim.x / cg_size;
+  int64_t idx               = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
+
+  while (idx < n) {
+    set.insert(tile, *(keys + idx));
+    idx += loop_stride;
+  }
+}
+
+template <typename SetRef, typename InputIterator, typename OutputIterator>
+__global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, OutputIterator found)
+{
+  int64_t const loop_stride = gridDim.x * blockDim.x;
+  int64_t idx               = blockDim.x * blockIdx.x + threadIdx.x;
+
+  while (idx < n) {
+    found[idx] = set.contains(*(keys + idx));
+    idx += loop_stride;
+  }
+}
+
+/**
+ * @file device_reference_example.cu
+ * @brief Demonstrates usage of the static_set device-side APIs.
+ *
+ * static_set provides a non-owning reference which can be used to interact with
+ * the container from within device code.
+ *
+ */
+int main(void)
+{
+  using Key = int;
+
+  // Empty slots are represented by reserved "sentinel" values. These values should be selected such
+  // that they never occur in your input data.
+  Key constexpr empty_key_sentinel = -1;
+
+  // Number of keys to be inserted
+  std::size_t constexpr num_keys = 50'000;
+
+  // Compute capacity based on a 50% load factor
+  auto constexpr load_factor = 0.5;
+  std::size_t const capacity = std::ceil(num_keys / load_factor);
+
+  using set_type = cuco::experimental::static_set<Key>;
+
+  // Constructs a hash set with at least "capacity" slots using -1 as the empty key sentinel.
+  set_type set{capacity, cuco::empty_key{empty_key_sentinel}};
+
+  // Create a sequence of keys {0, 1, 2, .., i}
+  thrust::device_vector<Key> keys(num_keys);
+  thrust::sequence(keys.begin(), keys.end(), 0);
+
+  // Insert the first half of the keys into the set
+  set.insert(keys.begin(), keys.begin() + num_keys / 2);
+
+  // Insert the second half of keys using a custom CUDA kernel.
+  custom_cooperative_insert<<<128, 128>>>(
+    set.ref(cuco::experimental::insert), keys.begin() + num_keys / 2, num_keys / 2);
+
+  // Storage for result
+  thrust::device_vector<bool> found(num_keys);
+
+  // Check if all keys are now contained in the set. Note that we pass a reference that already has
+  // the `contains` operator.
+  // In general, using two or more reference objects to the same container but with
+  // a different set of operators concurrently is undefined behavior.
+  // This does not apply here since the two kernels do not overlap.
+  custom_contains<<<128, 128>>>(
+    set.ref(cuco::experimental::contains), keys.begin(), num_keys, found.begin());
+
+  // Verify that all keys have been found
+  bool const all_keys_found = thrust::all_of(found.begin(), found.end(), thrust::identity<bool>());
+
+  if (all_keys_found) { std::cout << "Success! Found all keys.\n"; }
+
+  return 0;
+}