From ed620ab4b885676991699c5823a802fd0ad90f99 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 5 Apr 2023 17:45:56 -0700 Subject: [PATCH] Add data structure base classes and `cuco::static_set` (#278) This is the first PR related to #110. It introduces the concept of: - New probing scheme via probing iterator - Array of Windows storage instead of flat storage to better deal with memory bandwidth-bound workload when hash collisions are present - Dynamic and static extent type for efficient probing - Mixin to encode concurrent device operators - Synchronous and asynchronous host bulk APIs This PR also adds `cuco::static_set` to evaluate the new design. For now, only 2 basic operations, `insert` and `contains`, are supported. --------- Co-authored-by: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- README.md | 8 + benchmarks/CMakeLists.txt | 7 + .../hash_table/static_set/contains_bench.cu | 75 ++++ .../hash_table/static_set/insert_bench.cu | 81 ++++ .../hash_table/static_set/size_bench.cu | 62 ++++ examples/CMakeLists.txt | 2 + examples/static_set/device_ref_example.cu | 116 ++++++ examples/static_set/host_bulk_example.cu | 72 ++++ include/cuco/detail/equal_wrapper.cuh | 95 +++++ include/cuco/detail/operator.inl | 59 +++ include/cuco/detail/prime.hpp | 1 + include/cuco/detail/probing_scheme_base.cuh | 42 +++ include/cuco/detail/probing_scheme_impl.inl | 146 ++++++++ include/cuco/detail/static_set/kernels.cuh | 287 +++++++++++++++ include/cuco/detail/static_set/static_set.inl | 249 +++++++++++++ .../cuco/detail/static_set/static_set_ref.inl | 348 ++++++++++++++++++ include/cuco/detail/storage/aow_storage.cuh | 247 +++++++++++++ .../cuco/detail/storage/counter_storage.cuh | 112 ++++++ include/cuco/detail/storage/kernels.cuh | 55 +++ include/cuco/detail/storage/storage.cuh | 62 ++++ include/cuco/detail/storage/storage_base.cuh | 91 +++++ 
include/cuco/detail/tuning.cuh | 27 ++ include/cuco/detail/utils.cuh | 36 +- include/cuco/detail/utils.hpp | 43 ++- include/cuco/extent.cuh | 149 ++++++++ include/cuco/operator.hpp | 40 ++ include/cuco/probing_scheme.cuh | 153 ++++++++ include/cuco/static_set.cuh | 289 +++++++++++++++ include/cuco/static_set_ref.cuh | 131 +++++++ include/cuco/storage.cuh | 47 +++ include/cuco/utility/traits.hpp | 8 +- tests/CMakeLists.txt | 17 +- tests/static_set/capacity_test.cu | 112 ++++++ tests/static_set/heterogeneous_lookup_test.cu | 120 ++++++ tests/static_set/large_input_test.cu | 88 +++++ tests/static_set/size_test.cu | 42 +++ tests/static_set/unique_sequence_test.cu | 110 ++++++ tests/utility/extent_test.cu | 56 +++ tests/utility/storage_test.cu | 90 +++++ 39 files changed, 3768 insertions(+), 7 deletions(-) create mode 100644 benchmarks/hash_table/static_set/contains_bench.cu create mode 100644 benchmarks/hash_table/static_set/insert_bench.cu create mode 100644 benchmarks/hash_table/static_set/size_bench.cu create mode 100644 examples/static_set/device_ref_example.cu create mode 100644 examples/static_set/host_bulk_example.cu create mode 100644 include/cuco/detail/equal_wrapper.cuh create mode 100644 include/cuco/detail/operator.inl create mode 100644 include/cuco/detail/probing_scheme_base.cuh create mode 100644 include/cuco/detail/probing_scheme_impl.inl create mode 100644 include/cuco/detail/static_set/kernels.cuh create mode 100644 include/cuco/detail/static_set/static_set.inl create mode 100644 include/cuco/detail/static_set/static_set_ref.inl create mode 100644 include/cuco/detail/storage/aow_storage.cuh create mode 100644 include/cuco/detail/storage/counter_storage.cuh create mode 100644 include/cuco/detail/storage/kernels.cuh create mode 100644 include/cuco/detail/storage/storage.cuh create mode 100644 include/cuco/detail/storage/storage_base.cuh create mode 100644 include/cuco/detail/tuning.cuh create mode 100644 include/cuco/extent.cuh create mode 100644 
include/cuco/operator.hpp create mode 100644 include/cuco/probing_scheme.cuh create mode 100644 include/cuco/static_set.cuh create mode 100644 include/cuco/static_set_ref.cuh create mode 100644 include/cuco/storage.cuh create mode 100644 tests/static_set/capacity_test.cu create mode 100644 tests/static_set/heterogeneous_lookup_test.cu create mode 100644 tests/static_set/large_input_test.cu create mode 100644 tests/static_set/size_test.cu create mode 100644 tests/static_set/unique_sequence_test.cu create mode 100644 tests/utility/extent_test.cu create mode 100644 tests/utility/storage_test.cu diff --git a/README.md b/README.md index 9f3984bc5..ef294f838 100644 --- a/README.md +++ b/README.md @@ -181,6 +181,14 @@ class example_class { We plan to add many GPU-accelerated, concurrent data structures to `cuCollections`. As of now, the two flagships are variants of hash tables. +### `static_set` + +`cuco::static_set` is a fixed-size container that stores unique elements in no particular order. See the Doxygen documentation in `static_set.cuh` for more detailed information. + +#### Examples: +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/jnjcdG16c)) +- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/EGMj6qx73)) + ### `static_map` `cuco::static_map` is a fixed-size hash table using open addressing with linear probing. See the Doxygen documentation in `static_map.cuh` for more detailed information. 
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 13f1e3be0..750c9be86 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -46,6 +46,13 @@ endfunction(ConfigureBench) ### benchmark sources ############################################################################# ################################################################################################### +################################################################################################### +# - static_set benchmarks ------------------------------------------------------------------------- +ConfigureBench(STATIC_SET_BENCH + hash_table/static_set/contains_bench.cu + hash_table/static_set/insert_bench.cu + hash_table/static_set/size_bench.cu) + ################################################################################################### # - static_map benchmarks ------------------------------------------------------------------------- ConfigureBench(STATIC_MAP_BENCH diff --git a/benchmarks/hash_table/static_set/contains_bench.cu b/benchmarks/hash_table/static_set/contains_bench.cu new file mode 100644 index 000000000..b0c0f34f4 --- /dev/null +++ b/benchmarks/hash_table/static_set/contains_bench.cu @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::contains` performance + */ +template +void static_set_contains(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + set.insert(keys.begin(), keys.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + set.contains(keys.begin(), keys.end(), result.begin(), launch.get_stream()); + }); +} + +NVBENCH_BENCH_TYPES(static_set_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_contains_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_contains_unique_matching_rate") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_set/insert_bench.cu b/benchmarks/hash_table/static_set/insert_bench.cu new file mode 100644 index 000000000..cb5dcf1f8 --- /dev/null +++ 
b/benchmarks/hash_table/static_set/insert_bench.cu @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::insert` performance + */ +template +void static_set_insert(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + cuco::experimental::static_set set{ + size, cuco::empty_key{-1}, {}, {}, {}, launch.get_stream()}; + + timer.start(); + set.insert(keys.begin(), keys.end(), launch.get_stream()); + timer.stop(); + }); +} + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", 
defaults::MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_gaussian_skew") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git a/benchmarks/hash_table/static_set/size_bench.cu b/benchmarks/hash_table/static_set/size_bench.cu new file mode 100644 index 000000000..ded20fe04 --- /dev/null +++ b/benchmarks/hash_table/static_set/size_bench.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::size` performance + */ +template +void static_set_size(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + state.add_element_count(num_keys); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + + set.insert(keys.begin(), keys.end()); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto const size = set.size(launch.get_stream()); }); +} + +NVBENCH_BENCH_TYPES(static_set_size, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_size_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9e02d62ba..d78627eee 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -33,6 +33,8 @@ endfunction(ConfigureExample) ### Example sources ############################################################################### ################################################################################################### +ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu") +ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu") ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") 
ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu") ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") diff --git a/examples/static_set/device_ref_example.cu b/examples/static_set/device_ref_example.cu new file mode 100644 index 000000000..0179baa83 --- /dev/null +++ b/examples/static_set/device_ref_example.cu @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include + +#include +#include + +// insert a set of keys into a hash set using one cooperative group for each task +template +__global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n) +{ + namespace cg = cooperative_groups; + + constexpr auto cg_size = SetRef::cg_size; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + + int64_t const loop_stride = gridDim.x * blockDim.x / cg_size; + int64_t idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; + + while (idx < n) { + set.insert(tile, *(keys + idx)); + idx += loop_stride; + } +} + +template +__global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, OutputIterator found) +{ + int64_t const loop_stride = gridDim.x * blockDim.x; + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + + while (idx < n) { + found[idx] = set.contains(*(keys + idx)); + idx += loop_stride; + } +} + +/** + * @file device_reference_example.cu + * @brief Demonstrates usage of the static_set device-side APIs. + * + * static_set provides a non-owning reference which can be used to interact with + * the container from within device code. + * + */ +int main(void) +{ + using Key = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key constexpr empty_key_sentinel = -1; + + // Number of keys to be inserted + std::size_t constexpr num_keys = 50'000; + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + using set_type = cuco::experimental::static_set; + + // Constructs a hash set with at least "capacity" slots using -1 as the empty key sentinel. 
+ set_type set{capacity, cuco::empty_key{empty_key_sentinel}}; + + // Create a sequence of keys {0, 1, 2, .., i} + thrust::device_vector keys(num_keys); + thrust::sequence(keys.begin(), keys.end(), 0); + + // Insert the first half of the keys into the set + set.insert(keys.begin(), keys.begin() + num_keys / 2); + + // Insert the second half of keys using a custom CUDA kernel. + custom_cooperative_insert<<<128, 128>>>( + set.ref(cuco::experimental::insert), keys.begin() + num_keys / 2, num_keys / 2); + + // Storage for result + thrust::device_vector found(num_keys); + + // Check if all keys are now contained in the set. Note that we pass a reference that already has + // the `contains` operator. + // In general, using two or more reference objects to the same container but with + // a different set of operators concurrently is undefined behavior. + // This does not apply here since the two kernels do not overlap. + custom_contains<<<128, 128>>>( + set.ref(cuco::experimental::contains), keys.begin(), num_keys, found.begin()); + + // Verify that all keys have been found + bool const all_keys_found = thrust::all_of(found.begin(), found.end(), thrust::identity()); + + if (all_keys_found) { std::cout << "Success! Found all keys.\n"; } + + return 0; +} diff --git a/examples/static_set/host_bulk_example.cu b/examples/static_set/host_bulk_example.cu new file mode 100644 index 000000000..3b8c4deb4 --- /dev/null +++ b/examples/static_set/host_bulk_example.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include + +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of the static_set "bulk" host APIs. + * + * The bulk APIs are only invocable from the host and are used for doing operations like `insert` or + * `contains` on a set of keys. + * + */ +int main(void) +{ + using Key = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key constexpr empty_key_sentinel = -1; + + // Number of keys to be inserted + std::size_t constexpr num_keys = 50'000; + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + // Constructs a set with at least `capacity` slots using -1 as the empty keys sentinel. + cuco::experimental::static_set set{capacity, cuco::empty_key{empty_key_sentinel}}; + + // Create a sequence of keys {0, 1, 2, .., i} + thrust::device_vector keys(num_keys); + thrust::sequence(keys.begin(), keys.end(), 0); + + // Inserts all keys into the hash set + set.insert(keys.begin(), keys.end()); + + // Storage for result + thrust::device_vector found(num_keys); + + // Check if all keys are contained in the set + set.contains(keys.begin(), keys.end(), found.begin()); + + // Verify that all keys have been found + bool const all_keys_found = thrust::all_of(found.begin(), found.end(), thrust::identity()); + + if (all_keys_found) { std::cout << "Success! 
Found all keys.\n"; } + + return 0; +} diff --git a/include/cuco/detail/equal_wrapper.cuh b/include/cuco/detail/equal_wrapper.cuh new file mode 100644 index 000000000..1774e0bf3 --- /dev/null +++ b/include/cuco/detail/equal_wrapper.cuh @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Enum of equality comparison results. + */ +enum class equal_result : int32_t { UNEQUAL = 0, EMPTY = 1, EQUAL = 2 }; + +/** + * @brief Equality wrapper. + * + * User-provided equality binary callable cannot be used to compare against sentinel value. + * + * @tparam T Right-hand side Element type + * @tparam Equal Type of user-provided equality binary callable + */ +template +struct equal_wrapper { + T sentinel_; ///< Sentinel value + Equal equal_; ///< Custom equality callable + + /** + * @brief Equality wrapper ctor. + * + * @param sentinel Sentinel value + * @param equal Equality binary callable + */ + __host__ __device__ constexpr equal_wrapper(T sentinel, Equal const& equal) + : sentinel_{sentinel}, equal_{equal} + { + } + + /** + * @brief Equality check with the given equality callable. 
+ * + * @tparam LHS Left-hand side Element type + * @tparam RHS Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * @return Three way equality comparison result + */ + template + __device__ constexpr equal_result equal_to(LHS const& lhs, RHS const& rhs) const noexcept + { + return equal_(lhs, rhs) ? equal_result::EQUAL : equal_result::UNEQUAL; + } + + /** + * @brief Order-sensitive equality operator. + * + * This function always compares the left-hand side element against `sentinel_` value first + * then perform a equality check with the given `equal_` callable, i.e., `equal_(lhs, rhs)`. + * + * @note Container (like set or map) keys MUST be always on the left-hand side. + * + * @tparam U Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * @return Three way equality comparison result + */ + template + __device__ constexpr equal_result operator()(T const& lhs, U const& rhs) const noexcept + { + return cuco::detail::bitwise_compare(lhs, sentinel_) ? equal_result::EMPTY + : this->equal_to(lhs, rhs); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/operator.inl b/include/cuco/detail/operator.inl new file mode 100644 index 000000000..fdd5884e8 --- /dev/null +++ b/include/cuco/detail/operator.inl @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief CRTP mixin which augments a given `Reference` with an `Operator`. + * + * @throw If the operator is not defined in `include/cuco/operator.hpp` + * + * @tparam Operator Operator type, i.e., `cuco::op::*_tag` + * @tparam Reference The reference type. + * + * @note This primary template should never be instantiated. + */ +template +class operator_impl { + static_assert(cuco::dependent_false, + "Operator type is not supported by reference type."); +}; + +/** + * @brief Checks if the given `Operator` is contained in a list of `Operators`. + * + * @tparam Operator Operator type, i.e., `cuco::op::*_tag` + * @tparam Operators List of operators to search in + * + * @return `true` if `Operator` is contained in `Operators`, `false` otherwise. 
+ */ +template +static constexpr bool has_operator() +{ + return ((std::is_same_v) || ...); +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 1180035ae..186a29257 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace cuco { namespace detail { diff --git a/include/cuco/detail/probing_scheme_base.cuh b/include/cuco/detail/probing_scheme_base.cuh new file mode 100644 index 000000000..03f712155 --- /dev/null +++ b/include/cuco/detail/probing_scheme_base.cuh @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Base class of public probing scheme. + * + * This class should not be used directly. + * + * @tparam CGSize Size of CUDA Cooperative Groups + */ +template +class probing_scheme_base { + public: + /** + * @brief The size of the CUDA cooperative thread group. 
+ */ + static constexpr int32_t cg_size = CGSize; +}; +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme_impl.inl new file mode 100644 index 000000000..4f7e904a1 --- /dev/null +++ b/include/cuco/detail/probing_scheme_impl.inl @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Probing iterator class. 
+ * + * @tparam Extent Type of Extent + */ +template +class probing_iterator { + public: + using extent_type = Extent; ///< Extent type + using size_type = typename extent_type::value_type; ///< Size type + + /** + * @brief Constructs an probing iterator + * + * @param start Iteration starting point + * @param step_size Double hashing step size + * @param upper_bound Upper bound of the iteration + */ + __host__ __device__ constexpr probing_iterator(size_type start, + size_type step_size, + extent_type upper_bound) noexcept + : curr_index_{start}, step_size_{step_size}, upper_bound_{upper_bound} + { + // TODO: revise this API when introducing quadratic probing into cuco + } + + /** + * @brief Dereference operator + * + * @return Current slot ndex + */ + __host__ __device__ constexpr auto operator*() const noexcept { return curr_index_; } + + /** + * @brief Prefix increment operator + * + * @return Current iterator + */ + __host__ __device__ constexpr auto operator++() noexcept + { + // TODO: step_size_ can be a build time constant (e.g. linear probing) + // Worth passing another extent type? 
+ curr_index_ = (curr_index_ + step_size_) % upper_bound_; + return *this; + } + + /** + * @brief Postfix increment operator + * + * @return Old iterator before increment + */ + __host__ __device__ constexpr auto operator++(int32_t) noexcept + { + auto temp = *this; + ++(*this); + return temp; + } + + private: + size_type curr_index_; + size_type step_size_; + extent_type upper_bound_; +}; +} // namespace detail + +template +__host__ __device__ constexpr linear_probing::linear_probing(Hash const& hash) + : hash_{hash} +{ +} + +template +template +__host__ __device__ constexpr auto linear_probing::operator()( + ProbeKey const& probe_key, Extent upper_bound) const noexcept +{ + return detail::probing_iterator{hash_(probe_key) % upper_bound, + 1, // step size is 1 + upper_bound}; +} + +template +template +__host__ __device__ constexpr auto linear_probing::operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept +{ + return detail::probing_iterator{ + (hash_(probe_key) + g.thread_rank()) % upper_bound, cg_size, upper_bound}; +} + +template +__host__ __device__ constexpr double_hashing::double_hashing( + Hash1 const& hash1, Hash2 const& hash2) + : hash1_{hash1}, hash2_{hash2} +{ +} + +template +template +__host__ __device__ constexpr auto double_hashing::operator()( + ProbeKey const& probe_key, Extent upper_bound) const noexcept +{ + return detail::probing_iterator{ + hash1_(probe_key) % upper_bound, + hash2_(probe_key) % (upper_bound - 1) + 1, // step size in range [1, prime - 1] + upper_bound}; +} + +template +template +__host__ __device__ constexpr auto double_hashing::operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept +{ + return detail::probing_iterator{ + (hash1_(probe_key) + g.thread_rank()) % upper_bound, + (hash2_(probe_key) % (upper_bound / cg_size - 1) + 1) * cg_size, + upper_bound}; +} +} // namespace 
experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh new file mode 100644 index 000000000..e7d52ae27 --- /dev/null +++ b/include/cuco/detail/static_set/kernels.cuh @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Inserts all elements in the range `[first, first + n)` and returns the number of + * successful insertions. + * + * If multiple elements in `[first, first + size)` compare equal, it is unspecified which + * element is inserted. 
+ * + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam AtomicT Atomic counter type + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param num_successes Number of successful inserted elements + * @param ref Non-owing set device ref used to access the slot storage + */ +template +__global__ void insert(InputIterator first, + cuco::detail::index_type n, + AtomicT* num_successes, + Ref ref) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + typename Ref::size_type thread_num_successes = 0; + + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + if (ref.insert(insert_pair)) { thread_num_successes++; }; + idx += loop_stride; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + typename Ref::size_type block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Inserts all elements in the range `[first, first + n)`. + * + * If multiple elements in `[first, first + n)` compare equal, it is unspecified which + * element is inserted. 
+ * + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param ref Non-owning set device ref used to access the slot storage + */ +template +__global__ void insert_async(InputIterator first, cuco::detail::index_type n, Ref ref) +{ + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + ref.insert(insert_pair); + idx += loop_stride; + } +} + +/** + * @brief Inserts all elements in the range `[first, first + n)` and returns the number of + * successful insertions. + * + * If multiple elements in `[first, first + n)` compare equal, it is unspecified which + * element is inserted. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam AtomicT Atomic counter type + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param num_successes Number of successful inserted elements + * @param ref Non-owing set device ref used to access the slot storage + */ +template +__global__ void insert(InputIterator first, + cuco::detail::index_type n, + AtomicT* num_successes, + Ref ref) +{ + namespace cg = cooperative_groups; + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + typename Ref::size_type thread_num_successes = 0; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + if (ref.insert(tile, insert_pair) && tile.thread_rank() == 0) { thread_num_successes++; }; + idx += loop_stride; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + typename Ref::size_type block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Inserts all elements in the range `[first, first + n)`. + * + * If multiple elements in `[first, first + n)` compare equal, it is unspecified which + * element is inserted. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param ref Non-owing set device ref used to access the slot storage + */ +template +__global__ void insert_async(InputIterator first, cuco::detail::index_type n, Ref ref) +{ + namespace cg = cooperative_groups; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + ref.insert(tile, insert_pair); + idx += loop_stride; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data + * structure. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the data + * structure. 
+ * + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param ref Non-owing set device ref used to access the slot storage + */ +template +__global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +{ + namespace cg = cooperative_groups; + + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = BlockSize * blockIdx.x + threadIdx.x; + __shared__ bool output_buffer[BlockSize]; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if (idx < n) { + auto const key = *(first + idx); + /* + * The ld.relaxed.gpu instruction used in this operation causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. + */ + output_buffer[thread_idx] = ref.contains(key); + } + + block.sync(); + if (idx < n) { *(output_begin + idx) = output_buffer[thread_idx]; } + idx += loop_stride; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data + * structure. + * + * Writes a `bool` to `(output + i)` indicating if the key `*(first + i)` exists in the data + * structure. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param ref Non-owing set device ref used to access the slot storage + */ +template +__global__ void contains(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +{ + namespace cg = cooperative_groups; + + auto block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + + auto tile = cg::tiled_partition(cg::this_thread_block()); + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize / CGSize; + cuco::detail::index_type idx = (BlockSize * blockIdx.x + threadIdx.x) / CGSize; + + __shared__ bool output_buffer[BlockSize / CGSize]; + auto const tile_idx = thread_idx / CGSize; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if (idx < n) { + auto const key = *(first + idx); + auto const found = ref.contains(tile, key); + /* + * The ld.relaxed.gpu instruction used in view.find causes L1 to + * flush more frequently, causing increased sector stores from L2 to global memory. + * By writing results to shared memory and then synchronizing before writing back + * to global, we no longer rely on L1, preventing the increase in sector stores from + * L2 to global and improving performance. 
+ */ + if (tile.thread_rank() == 0) { output_buffer[tile_idx] = found; } + } + + block.sync(); + if (idx < n and tile.thread_rank() == 0) { *(output_begin + idx) = output_buffer[tile_idx]; } + idx += loop_stride; + } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl new file mode 100644 index 000000000..0443c1184 --- /dev/null +++ b/include/cuco/detail/static_set/static_set.inl @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { + +template +constexpr static_set::static_set( + Extent capacity, + empty_key empty_key_sentinel, + KeyEqual pred, + ProbingScheme const& probing_scheme, + Allocator const& alloc, + cudaStream_t stream) + : empty_key_sentinel_{empty_key_sentinel}, + predicate_{pred}, + probing_scheme_{probing_scheme}, + allocator_{alloc}, + storage_{make_valid_extent(capacity), allocator_} +{ + storage_.initialize(empty_key_sentinel_, stream); +} + +template +template +static_set::size_type +static_set::insert( + InputIt first, InputIt last, cudaStream_t stream) +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return 0; } + + auto counter = detail::counter_storage{allocator_}; + counter.reset(stream); + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + if constexpr (cg_size == 1) { + detail::insert + <<>>( + first, num_keys, counter.data(), ref(op::insert)); + } else { + detail::insert + <<>>( + first, num_keys, counter.data(), ref(op::insert)); + } + + return counter.load_to_host(stream); +} + +template +template +void static_set::insert_async( + InputIt first, InputIt last, cudaStream_t stream) +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + if constexpr (cg_size == 1) { + detail::insert_async + <<>>(first, num_keys, ref(op::insert)); + } else { + detail::insert_async + <<>>(first, num_keys, ref(op::insert)); + } +} + +template +template +void static_set::contains( + InputIt first, InputIt last, OutputIt 
output_begin, cudaStream_t stream) const +{ + contains_async(first, last, output_begin, stream); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); +} + +template +template +void static_set::contains_async( + InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream) const +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = + (cg_size * num_keys + detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE); + + if constexpr (cg_size == 1) { + detail::contains + <<>>( + first, num_keys, output_begin, ref(op::contains)); + } else { + detail::contains + <<>>( + first, num_keys, output_begin, ref(op::contains)); + } +} + +template +static_set::size_type +static_set::size( + cudaStream_t stream) const +{ + auto const begin = thrust::make_transform_iterator( + storage_.data(), + cuco::detail::elements_per_window{empty_key_sentinel_}); + + std::size_t temp_storage_bytes = 0; + using temp_allocator_type = typename std::allocator_traits::rebind_alloc; + auto temp_allocator = temp_allocator_type{allocator_}; + auto d_size = reinterpret_cast( + std::allocator_traits::allocate(temp_allocator, sizeof(size_type))); + cub::DeviceReduce::Sum( + nullptr, temp_storage_bytes, begin, d_size, storage_.num_windows(), stream); + + auto d_temp_storage = + std::allocator_traits::allocate(temp_allocator, temp_storage_bytes); + + cub::DeviceReduce::Sum( + d_temp_storage, temp_storage_bytes, begin, d_size, storage_.num_windows(), stream); + + size_type h_size; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&h_size, d_size, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + std::allocator_traits::deallocate( + temp_allocator, reinterpret_cast(d_size), sizeof(size_type)); + std::allocator_traits::deallocate( + temp_allocator, d_temp_storage, temp_storage_bytes); + + return h_size; +} + +template 
+constexpr auto +static_set::capacity() + const noexcept +{ + return storage_.capacity(); +} + +template +constexpr static_set::key_type +static_set::empty_key_sentinel() + const noexcept +{ + return empty_key_sentinel_; +} + +template +template +auto static_set::ref( + Operators...) const noexcept +{ + static_assert(sizeof...(Operators), "No operators specified"); + return ref_type{ + cuco::empty_key(empty_key_sentinel_), predicate_, probing_scheme_, storage_.ref()}; +} +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/static_set_ref.inl b/include/cuco/detail/static_set/static_set_ref.inl new file mode 100644 index 000000000..51099243f --- /dev/null +++ b/include/cuco/detail/static_set/static_set_ref.inl @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cuco { +namespace experimental { + +template +__host__ __device__ constexpr static_set_ref< + Key, + Scope, + KeyEqual, + ProbingScheme, + StorageRef, + Operators...>::static_set_ref(cuco::empty_key empty_key_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + StorageRef storage_ref) noexcept + : empty_key_sentinel_{empty_key_sentinel}, + predicate_{empty_key_sentinel, predicate}, + probing_scheme_{probing_scheme}, + storage_ref_{storage_ref} +{ +} + +template +__host__ __device__ constexpr auto +static_set_ref::capacity() + const noexcept +{ + return storage_ref_.capacity(); +} + +template +__host__ __device__ constexpr Key +static_set_ref::empty_key_sentinel() + const noexcept +{ + return empty_key_sentinel_; +} + +namespace detail { + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + static constexpr auto thread_scope = base_type::thread_scope; + + public: + /** + * @brief Inserts an element. 
+ * + * @param value The element to insert + * @return True if the given element is successfully inserted + */ + __device__ bool insert(value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto probing_iter = ref_.probing_scheme_(value, ref_.storage_ref_.num_windows()); + + while (true) { + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + // TODO: perf gain with #pragma unroll since num_windows is build time constant + for (auto& slot_content : window_slots) { + auto const eq_res = ref_.predicate_(slot_content, value); + + // If the key is already in the container, return false + if (eq_res == detail::equal_result::EQUAL) { return false; } + if (eq_res == detail::equal_result::EMPTY) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + switch (attempt_insert( + (ref_.storage_ref_.data() + *probing_iter)->data() + intra_window_index, value)) { + case insert_result::CONTINUE: continue; + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + } + } + } + ++probing_iter; + } + } + + /** + * @brief Inserts an element. 
+ * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * @return True if the given element is successfully inserted + */ + __device__ bool insert(cooperative_groups::thread_block_tile group, + value_type const& value) noexcept + { + auto& ref_ = static_cast(*this); + auto probing_iter = ref_.probing_scheme_(group, value, ref_.storage_ref_.num_windows()); + + while (true) { + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (ref_.predicate_(window_slots[i], value)) { + case detail::equal_result::EMPTY: return cuco::pair{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: return cuco::pair{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return cuco::pair{detail::equal_result::UNEQUAL, -1}; + }(); + + // If the key is already in the container, return false + if (group.any(state == detail::equal_result::EQUAL)) { return false; } + + auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); + + if (group_contains_empty) { + auto const src_lane = __ffs(group_contains_empty) - 1; + auto const status = + (group.thread_rank() == src_lane) + ? attempt_insert( + (ref_.storage_ref_.data() + *probing_iter)->data() + intra_window_index, value) + : insert_result::CONTINUE; + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + default: continue; + } + } else { + ++probing_iter; + } + } + } + + private: + // TODO: this should be a common enum for all data structures + enum class insert_result : int32_t { CONTINUE = 0, SUCCESS = 1, DUPLICATE = 2 }; + + /** + * @brief Attempts to insert an element into a slot. 
+ * + * @note Dispatches the correct implementation depending on the container + * type and presence of other operator mixins. + * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + [[nodiscard]] __device__ insert_result attempt_insert(value_type* slot, value_type const& value) + { + auto& ref_ = static_cast(*this); + + // temporary workaround due to performance regression + // https://github.com/NVIDIA/libcudacxx/issues/366 + value_type const old = [&]() { + value_type expected = ref_.empty_key_sentinel_.value; + value_type val = value; + if constexpr (sizeof(value_type) == sizeof(uint32_t)) { + auto* expected_ptr = reinterpret_cast(&expected); + auto* value_ptr = reinterpret_cast(&val); + if constexpr (thread_scope == cuda::thread_scope_system) { + return atomicCAS_system(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (thread_scope == cuda::thread_scope_device) { + return atomicCAS(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (thread_scope == cuda::thread_scope_block) { + return atomicCAS_block(reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + if constexpr (sizeof(value_type) == sizeof(uint64_t)) { + auto* expected_ptr = reinterpret_cast(&expected); + auto* value_ptr = reinterpret_cast(&val); + if constexpr (thread_scope == cuda::thread_scope_system) { + return atomicCAS_system( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (thread_scope == cuda::thread_scope_device) { + return atomicCAS( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else if constexpr (thread_scope == cuda::thread_scope_block) { + return atomicCAS_block( + reinterpret_cast(slot), *expected_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } 
+ }(); + if (*slot == old) { + // Shouldn't use `predicate_` operator directly since it includes a redundant bitwise compare + return ref_.predicate_.equal_to(old, value) == detail::equal_result::EQUAL + ? insert_result::DUPLICATE + : insert_result::CONTINUE; + } else { + return insert_result::SUCCESS; + } + } +}; + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. + * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + + auto probing_iter = ref_.probing_scheme_(key, ref_.storage_ref_.num_windows()); + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + for (auto& slot_content : window_slots) { + switch (ref_.predicate_(slot_content, key)) { + case detail::equal_result::UNEQUAL: continue; + case detail::equal_result::EMPTY: return false; + case detail::equal_result::EQUAL: return true; + } + } + ++probing_iter; + } + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. 
+ * + * @tparam ProbeKey Probe key type + * + * @param g The Cooperative Group used to perform group contains + * @param key The key to search for + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(cooperative_groups::thread_block_tile const& g, + ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + + auto probing_iter = ref_.probing_scheme_(g, key, ref_.storage_ref_.num_windows()); + + while (true) { + auto const window_slots = ref_.storage_ref_[*probing_iter]; + + auto const state = [&]() { + for (auto& slot : window_slots) { + switch (ref_.predicate_(slot, key)) { + case detail::equal_result::EMPTY: return detail::equal_result::EMPTY; + case detail::equal_result::EQUAL: return detail::equal_result::EQUAL; + default: continue; + } + } + return detail::equal_result::UNEQUAL; + }(); + + if (g.any(state == detail::equal_result::EQUAL)) { return true; } + if (g.any(state == detail::equal_result::EMPTY)) { return false; } + + ++probing_iter; + } + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/aow_storage.cuh b/include/cuco/detail/storage/aow_storage.cuh new file mode 100644 index 000000000..316f7fbe5 --- /dev/null +++ b/include/cuco/detail/storage/aow_storage.cuh @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Base class of array of slot windows open addressing storage. + * + * This should NOT be used directly. + * + * @tparam WindowSize Number of elements in each window + * @tparam T Element type + * @tparam Extent Type of extent denoting the number of windows + */ +template +class aow_storage_base : public storage_base { + public: + /** + * @brief The number of elements (slots) processed per window. + */ + static constexpr int32_t window_size = WindowSize; + + using extent_type = typename storage_base::extent_type; ///< Storage extent type + using size_type = typename storage_base::size_type; ///< Storage size type + + using value_type = T; ///< Slot type + using window_type = cuda::std::array; ///< Slot window type + + /** + * @brief Constructor of AoW base storage. + * + * @param size Number of windows to store + */ + explicit constexpr aow_storage_base(Extent size) : storage_base{size} {} + + /** + * @brief Gets the total number of slot windows in the current storage. + * + * @return The total number of slot windows + */ + [[nodiscard]] __host__ __device__ constexpr extent_type num_windows() const noexcept + { + return storage_base::capacity(); + } + + /** + * @brief Gets the total number of slots in the current storage. + * + * @return The total number of slots + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept + { + return storage_base::capacity().template multiply(); + } +}; + +/** + * @brief Non-owning AoW storage reference type. 
+ * + * @tparam WindowSize Number of slots in each window + * @tparam T Storage element type + * @tparam Extent Type of extent denoting storage capacity + */ +template +class aow_storage_ref : public aow_storage_base { + public: + using base_type = aow_storage_base; ///< AoW base class type + + using base_type::window_size; ///< Number of elements processed per window + + using extent_type = typename base_type::extent_type; ///< Storage extent type + using size_type = typename base_type::size_type; ///< Storage size type + using value_type = typename base_type::value_type; ///< Slot type + using window_type = typename base_type::window_type; ///< Slot window type + + using base_type::capacity; + using base_type::num_windows; + + /** + * @brief Constructor of AoW storage ref. + * + * @param num_windows Number of windows + * @param windows Pointer to the windows array + */ + explicit constexpr aow_storage_ref(Extent num_windows, window_type* windows) noexcept + : aow_storage_base{num_windows}, windows_{windows} + { + } + + /** + * @brief Gets windows array. + * + * @return Pointer to the first window + */ + [[nodiscard]] __device__ constexpr window_type* data() noexcept { return windows_; } + + /** + * @brief Gets windows array. + * + * @return Pointer to the first window + */ + [[nodiscard]] __device__ constexpr window_type* data() const noexcept { return windows_; } + + /** + * @brief Returns an array of slots (or a window) for a given index. + * + * @param index Index of the window + * @return An array of slots + */ + [[nodiscard]] __device__ constexpr window_type operator[](size_type index) const noexcept + { + return *reinterpret_cast( + __builtin_assume_aligned(this->data() + index, sizeof(value_type) * window_size)); + } + + private: + window_type* windows_; ///< Pointer to the windows array +}; + +/** + * @brief Array of slot Window open addressing storage class. 
+ * + * @tparam WindowSize Number of slots in each window + * @tparam T Slot type + * @tparam Extent Type of extent denoting number of windows + * @tparam Allocator Type of allocator used for device storage (de)allocation + */ +template +class aow_storage : public aow_storage_base { + public: + using base_type = aow_storage_base; ///< AoW base class type + + using base_type::window_size; ///< Number of elements processed per window + + using extent_type = typename base_type::extent_type; ///< Storage extent type + using size_type = typename base_type::size_type; ///< Storage size type + using value_type = typename base_type::value_type; ///< Slot type + using window_type = typename base_type::window_type; ///< Slot window type + + using base_type::capacity; + using base_type::num_windows; + + using allocator_type = + typename std::allocator_traits::rebind_alloc; ///< Type of the + ///< allocator to + ///< (de)allocate windows + using window_deleter_type = custom_deleter; ///< Type of window deleter + using ref_type = aow_storage_ref; ///< Storage ref type + + /** + * @brief Constructor of AoW storage. + * + * @note The input `size` should be exclusively determined by the return value of + * `make_valid_extent` since it depends on the requested low-bound value, the probing scheme, and + * the storage. + * + * @param size Number of windows to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr aow_storage(Extent size, Allocator const& allocator) + : aow_storage_base{size}, + allocator_{allocator}, + window_deleter_{capacity(), allocator_}, + windows_{allocator_.allocate(capacity()), window_deleter_} + { + } + + aow_storage(aow_storage&&) = default; ///< Move constructor + /** + * @brief Replaces the contents of the storage with another storage. 
+ * + * @return Reference of the current storage object + */ + aow_storage& operator=(aow_storage&&) = default; + ~aow_storage() = default; ///< Destructor + + aow_storage(aow_storage const&) = delete; + aow_storage& operator=(aow_storage const&) = delete; + + /** + * @brief Gets windows array. + * + * @return Pointer to the first window + */ + [[nodiscard]] constexpr window_type* data() const noexcept { return windows_.get(); } + + /** + * @brief Gets window storage reference. + * + * @return Reference of window storage + */ + [[nodiscard]] constexpr ref_type ref() const noexcept + { + return ref_type{this->num_windows(), this->data()}; + } + + /** + * @brief Initializes each slot in the AoW storage to contain `key`. + * + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ + void initialize(value_type key, cudaStream_t stream) noexcept + { + auto constexpr stride = 4; + auto const grid_size = (this->num_windows() + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); + + detail::initialize<<>>( + this->data(), this->num_windows(), key); + } + + private: + allocator_type allocator_; ///< Allocator used to (de)allocate windows + window_deleter_type window_deleter_; ///< Custom windows deleter + std::unique_ptr windows_; ///< Pointer to AoW storage +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/counter_storage.cuh b/include/cuco/detail/storage/counter_storage.cuh new file mode 100644 index 000000000..bf87357a3 --- /dev/null +++ b/include/cuco/detail/storage/counter_storage.cuh @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Device atomic counter storage class. + * + * @tparam SizeType Type of storage size + * @tparam Scope The scope in which the counter will be used by individual threads + * @tparam Allocator Type of allocator used for device storage + */ +template +class counter_storage : public storage_base> { + public: + using storage_base>::capacity_; ///< Storage size + + using size_type = SizeType; ///< Size type + using value_type = cuda::atomic; ///< Type of the counter + using allocator_type = typename std::allocator_traits::rebind_alloc< + value_type>; ///< Type of the allocator to (de)allocate counter + using counter_deleter_type = custom_deleter; ///< Type of counter deleter + + /** + * @brief Constructor of counter storage. + * + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr counter_storage(Allocator const& allocator) + : storage_base>{cuco::experimental::extent{}}, + allocator_{allocator}, + counter_deleter_{capacity_, allocator_}, + counter_{allocator_.allocate(capacity_), counter_deleter_} + { + } + + /** + * @brief Asynchronously resets counter to zero. + * + * @param stream CUDA stream used to reset + */ + void reset(cudaStream_t stream) + { + static_assert(sizeof(size_type) == sizeof(value_type)); + CUCO_CUDA_TRY(cudaMemsetAsync(this->data(), 0, sizeof(value_type), stream)); + } + + /** + * @brief Gets device atomic counter pointer. 
+ * + * @return Pointer to the device atomic counter + */ + [[nodiscard]] constexpr value_type* data() noexcept { return counter_.get(); } + + /** + * @brief Gets device atomic counter pointer. + * + * @return Pointer to the device atomic counter + */ + [[nodiscard]] constexpr value_type* data() const noexcept { return counter_.get(); } + + /** + * @brief Atomically obtains the value of the device atomic counter and copies it to the host. + * + * @note This API synchronizes the given `stream`. + * + * @param stream CUDA stream used to copy device value to the host + * @return Value of the atomic counter + */ + [[nodiscard]] constexpr size_type load_to_host(cudaStream_t stream) const + { + size_type h_count; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + return h_count; + } + + private: + allocator_type allocator_; ///< Allocator used to (de)allocate counter + counter_deleter_type counter_deleter_; ///< Custom counter deleter + std::unique_ptr counter_; ///< Pointer to counter storage +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/kernels.cuh b/include/cuco/detail/storage/kernels.cuh new file mode 100644 index 000000000..546c58daa --- /dev/null +++ b/include/cuco/detail/storage/kernels.cuh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Initializes each slot in the window storage to contain `value`. + * + * @tparam WindowT Window type + * + * @param windows Pointer to flat storage for windows + * @param n Number of input windows + * @param value Value to which all values in `slots` are initialized + */ +template +__global__ void initialize(WindowT* windows, + cuco::detail::index_type n, + typename WindowT::value_type value) +{ + cuco::detail::index_type const loop_stride = gridDim.x * blockDim.x; + cuco::detail::index_type idx = blockDim.x * blockIdx.x + threadIdx.x; + + while (idx < n) { + auto& window_slots = *(windows + idx); +#pragma unroll + for (auto& slot : window_slots) { + slot = value; + } + idx += loop_stride; + } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/storage.cuh b/include/cuco/detail/storage/storage.cuh new file mode 100644 index 000000000..b4fc86890 --- /dev/null +++ b/include/cuco/detail/storage/storage.cuh @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Intermediate class internally used by data structures + * + * @tparam StorageImpl Storage implementation class + * @tparam T Storage element type + * @tparam Extent Type of extent denoting number of windows + * @tparam Allocator Type of allocator used for device storage + */ +template +class storage : StorageImpl::template impl { + public: + /// Storage implementation type + using impl_type = typename StorageImpl::template impl; + using ref_type = typename impl_type::ref_type; ///< Storage ref type + using value_type = typename impl_type::value_type; ///< Storage value type + + /// Number of elements per window + static constexpr int window_size = impl_type::window_size; + + using impl_type::capacity; + using impl_type::data; + using impl_type::initialize; + using impl_type::num_windows; + using impl_type::ref; + + /** + * @brief Constructs storage. + * + * @param size Number of slots to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr storage(Extent size, Allocator const& allocator) : impl_type{size, allocator} + { + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/storage_base.cuh b/include/cuco/detail/storage/storage_base.cuh new file mode 100644 index 000000000..dec443dce --- /dev/null +++ b/include/cuco/detail/storage/storage_base.cuh @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Custom deleter for unique pointer. + * + * @tparam Allocator Type of allocator used for device storage + */ +template +struct custom_deleter { + using pointer = typename Allocator::value_type*; ///< Value pointer type + + /** + * @brief Constructor of custom deleter. + * + * @param size Number of values to deallocate + * @param allocator Allocator used for deallocating device storage + */ + explicit constexpr custom_deleter(std::size_t size, Allocator& allocator) + : size_{size}, allocator_{allocator} + { + } + + /** + * @brief Operator for deallocation + * + * @param ptr Pointer to the first value for deallocation + */ + void operator()(pointer ptr) { allocator_.deallocate(ptr, size_); } + + std::size_t size_; ///< Number of values to delete + Allocator& allocator_; ///< Allocator used deallocating values +}; + +/** + * @brief Base class of open addressing storage. + * + * This class should not be used directly. + * + * @tparam Extent Type of extent denoting storage capacity + */ +template +class storage_base { + public: + using extent_type = Extent; ///< Storage extent type + using size_type = typename extent_type::value_type; ///< Storage size type + + /** + * @brief Constructor of base storage. + * + * @param size Number of elements to (de)allocate + */ + explicit constexpr storage_base(Extent size) : capacity_{size} {} + + /** + * @brief Gets the total number of elements in the current storage. 
+ * + * @return The total number of elements + */ + [[nodiscard]] __host__ __device__ constexpr extent_type capacity() const noexcept + { + return capacity_; + } + + protected: + extent_type capacity_; ///< Total number of elements +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/tuning.cuh b/include/cuco/detail/tuning.cuh new file mode 100644 index 000000000..035b60cc5 --- /dev/null +++ b/include/cuco/detail/tuning.cuh @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +namespace cuco { +namespace experimental { +namespace detail { + +static constexpr int CUCO_DEFAULT_BLOCK_SIZE = 128; +static constexpr int CUCO_DEFAULT_STRIDE = 1; + +} // namespace detail +} // namespace experimental +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index 3aadbb848..ae55f7830 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,6 +15,8 @@ #pragma once +#include + #include namespace cuco { @@ -59,7 +61,7 @@ struct slot_to_tuple { */ template struct slot_is_filled { - Key empty_key_sentinel; ///< The value of the empty key sentinel + Key empty_key_sentinel_; ///< The value of the empty key sentinel /** * @brief Indicates if the target slot `s` is filled. @@ -72,7 +74,35 @@ struct slot_is_filled { template __device__ bool operator()(S const& s) { - return thrust::get<0>(s) != empty_key_sentinel; + return not cuco::detail::bitwise_compare(thrust::get<0>(s), empty_key_sentinel_); + } +}; + +/** + * @brief Device functor returning the number of filled elements per window. + * + * @tparam Sentinel Empty sentinel type + */ +template +struct elements_per_window { + Sentinel empty_key_sentinel_; ///< The value of the empty key sentinel + + /** + * @brief Computes the number of filled elements per window. + * + * @tparam Window Window storage type + * + * @param window The window to query + * @return Number of filled elements per window + */ + template + __device__ inline int32_t operator()(Window const& window) const + { + int32_t num = 0; + for (auto const& element : window) { + num += not cuco::detail::bitwise_compare(element, empty_key_sentinel_); + } + return num; } }; diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp index d06216c54..513ccd559 100644 --- a/include/cuco/detail/utils.hpp +++ b/include/cuco/detail/utils.hpp @@ -23,6 +23,8 @@ namespace cuco { namespace detail { +using index_type = int64_t; ///< index type for internal use + /** * @brief Compute the number of bits of a simple type. 
* @@ -56,13 +58,50 @@ auto get_grid_size(Kernel kernel, std::size_t block_size, std::size_t dynamic_sm } template -constexpr inline int64_t distance(Iterator begin, Iterator end) +constexpr inline index_type distance(Iterator begin, Iterator end) { using category = typename std::iterator_traits::iterator_category; static_assert(std::is_base_of_v, "Input iterator should be a random access iterator."); // `int64_t` instead of arch-dependant `long int` - return static_cast(std::distance(begin, end)); + return static_cast(std::distance(begin, end)); +} + +/** + * @brief C++17 constexpr backport of `std::lower_bound`. + * + * @tparam ForwardIt Type of input iterator + * @tparam T Type of `value` + * + * @param first Iterator defining the start of the range to examine + * @param last Iterator defining the start of the range to examine + * @param value Value to compare the elements to + * + * @return Iterator pointing to the first element in the range [first, last) that does not satisfy + * element < value + */ +template +constexpr ForwardIt lower_bound(ForwardIt first, ForwardIt last, const T& value) +{ + using diff_type = typename std::iterator_traits::difference_type; + + ForwardIt it{}; + diff_type count = std::distance(first, last); + diff_type step{}; + + while (count > 0) { + it = first; + step = count / 2; + std::advance(it, step); + + if (static_cast(*it) < value) { + first = ++it; + count -= step + 1; + } else + count = step; + } + + return first; } } // namespace detail diff --git a/include/cuco/extent.cuh b/include/cuco/extent.cuh new file mode 100644 index 000000000..b825188ed --- /dev/null +++ b/include/cuco/extent.cuh @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cuco { +namespace experimental { +static constexpr std::size_t dynamic_extent = static_cast(-1); + +/** + * @brief Static extent class. + * + * @tparam SizeType Size type + * @tparam N Extent + */ +template +struct extent { + using value_type = SizeType; ///< Extent value type + + constexpr extent() = default; + + /// Constructs from `SizeType` + __host__ __device__ constexpr explicit extent(SizeType) noexcept {} + + /** + * @brief Conversion to value_type. + * + * @return Extent size + */ + __host__ __device__ constexpr operator value_type() const noexcept { return N; } + + /** + * @brief Multiplies the current extent with the given `Value`. + * + * @tparam Value The input value to multiply with + * + * @return Resulting static extent + */ + template + __host__ __device__ constexpr auto multiply() const noexcept + { + return extent{}; + } +}; + +/** + * @brief Dynamic extent class. + * + * @tparam SizeType Size type + */ +template +struct extent { + using value_type = SizeType; ///< Extent value type + + /** + * @brief Constructs extent from a given `size`. + * + * @param size The extent size + */ + __host__ __device__ constexpr extent(SizeType size) noexcept : value_{size} {} + + /** + * @brief Conversion to value_type. + * + * @return Extent size + */ + __host__ __device__ constexpr operator value_type() const noexcept { return value_; } + + /** + * @brief Multiplies the current extent with the given `Value`. 
+ * + * @tparam Value The input value to multiply with + * + * @return Resulting extent + */ + template + __host__ __device__ constexpr auto multiply() const noexcept + { + return extent{Value * value_}; + } + + private: + value_type value_; ///< Extent value +}; + +/** + * @brief Computes valid extent based on given parameters. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the input size of the ref. + * + * @tparam CGSize Number of elements handled per CG + * @tparam WindowSize Number of elements handled per Window + * @tparam SizeType Size type + * @tparam N Extent + * + * @throw If the input extent is invalid + * + * @return Resulting valid extent + */ +template +[[nodiscard]] auto constexpr make_valid_extent(extent ext) +{ + auto constexpr max_prime = cuco::detail::primes.back(); + auto constexpr max_value = + (static_cast(std::numeric_limits::max()) < max_prime) + ? 
std::numeric_limits::max() + : static_cast(max_prime); + auto const size = SDIV(ext, CGSize * WindowSize); + if (size <= 0 or size > max_value) { CUCO_FAIL("Invalid input extent"); } + + if constexpr (N == dynamic_extent) { + return extent{static_cast( + *cuco::detail::lower_bound( + cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast(size)) * + CGSize)}; + } + if constexpr (N != dynamic_extent) { + return extent(*cuco::detail::lower_bound(cuco::detail::primes.begin(), + cuco::detail::primes.end(), + static_cast(size)) * + CGSize)>{}; + } +} + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp new file mode 100644 index 000000000..abf66e0bd --- /dev/null +++ b/include/cuco/operator.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace cuco { +namespace experimental { +inline namespace op { +// TODO enum class of int32_t instead of struct +// https://github.com/NVIDIA/cuCollections/issues/239 +/** + * @brief `insert` operator tag + */ +struct insert_tag { +} inline constexpr insert; + +/** + * @brief `contains` operator tag + */ +struct contains_tag { +} inline constexpr contains; + +} // namespace op +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh new file mode 100644 index 000000000..0880ee97b --- /dev/null +++ b/include/cuco/probing_scheme.cuh @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +/** + * @brief Public linear probing scheme class. + * + * @note Linear probing is efficient when few collisions are present, e.g., low occupancy or low + * multiplicity. + * + * @note `Hash` should be callable object type. 
+ * + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash Unary callable type + */ +template +class linear_probing : private detail::probing_scheme_base { + public: + using probing_scheme_base_type = + detail::probing_scheme_base; ///< The base probe scheme type + using probing_scheme_base_type::cg_size; + + /** + *@brief Constructs linear probing scheme with the hasher callable. + * + * @param hash Hasher + */ + __host__ __device__ constexpr linear_probing(Hash const& hash = {}); + + /** + * @brief Operator to return a probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()(ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + /** + * @brief Operator to return a CG-based probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param g the Cooperative Group to generate probing iterator + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + private: + Hash hash_; +}; + +/** + * @brief Public double hashing scheme class. + * + * @note Default probing scheme for cuco data structures. It shows superior performance over linear + * probing especially when dealing with high multiplicty and/or high occupancy use cases. + * + * @note `Hash1` and `Hash2` should be callable object type. + * + * @note `Hash2` needs to be able to construct from an integer value to avoid secondary clustering. 
+ * + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash1 Unary callable type + * @tparam Hash2 Unary callable type + */ +template +class double_hashing : private detail::probing_scheme_base { + public: + using probing_scheme_base_type = + detail::probing_scheme_base; ///< The base probe scheme type + using probing_scheme_base_type::cg_size; + + /** + *@brief Constructs double hashing probing scheme with the two hasher callables. + * + * @param hash1 First hasher + * @param hash2 Second hasher + */ + __host__ __device__ constexpr double_hashing(Hash1 const& hash1 = {}, Hash2 const& hash2 = {1}); + + /** + * @brief Operator to return a probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()(ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + /** + * @brief Operator to return a CG-based probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param g the Cooperative Group to generate probing iterator + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + private: + Hash1 hash1_; + Hash2 hash2_; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh new file mode 100644 index 000000000..1e6a3f589 --- /dev/null +++ b/include/cuco/static_set.cuh @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#if defined(CUCO_HAS_CUDA_BARRIER) +#include +#endif + +#include +#include + +namespace cuco { +namespace experimental { +/** + * @brief A GPU-accelerated, unordered, associative container of unique keys. + * + * The `static_set` supports two types of operations: + * - Host-side "bulk" operations + * - Device-side "singular" operations + * + * The host-side bulk operations include `insert`, `contains`, etc. These APIs should be used when + * there are a large number of keys to modify or lookup. For example, given a range of keys + * specified by device-accessible iterators, the bulk `insert` function will insert all keys into + * the set. + * + * The singular device-side operations allow individual threads (or cooperative groups) to perform + * independent modify or lookup operations from device code. These operations are accessed through + * non-owning, trivially copyable reference types (or "ref"). User can combine any arbitrary + * operators (see options in `include/cuco/operator.hpp`) when creating the ref. Concurrent modify + * and lookup will be supported if both kinds of operators are specified during the ref + * construction. + * + * @note Allows constant time concurrent modify or lookup operations from threads in device code. 
+ * @note cuCollections data stuctures always place the slot keys on the left-hand side when invoking + * the key comparison predicate, i.e., `pred(slot_key, query_key)`. Order-sensitive `KeyEqual` + * should be used with caution. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` + * @tparam Extent Data structure size type + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for choices) + * @tparam Allocator Type of allocator used for device storage + * @tparam Storage Slot window storage type + */ + +template , + cuda::thread_scope Scope = cuda::thread_scope_device, + class KeyEqual = thrust::equal_to, + class ProbingScheme = experimental::double_hashing<1, // CG size + cuco::murmurhash3_32, + cuco::murmurhash3_32>, + class Allocator = cuco::cuda_allocator, + class Storage = cuco::experimental::aow_storage<2>> +class static_set { + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + static_assert( + std::is_base_of_v, + ProbingScheme>, + "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); + + 
 public:
+ static constexpr auto cg_size = ProbingScheme::cg_size; ///< CG size used for probing
+ static constexpr auto window_size = Storage::window_size; ///< Window size used for probing
+ static constexpr auto thread_scope = Scope; ///< CUDA thread scope
+
+ using key_type = Key; ///< Key type
+ using value_type = Key; ///< Key type
+ /// Extent type
+ using extent_type = decltype(make_valid_extent(std::declval()));
+ using size_type = typename extent_type::value_type; ///< Size type
+ using key_equal = KeyEqual; ///< Key equality comparator type
+ using allocator_type = Allocator; ///< Allocator type
+ using storage_type =
+ detail::storage; ///< Storage type
+
+ using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type
+ using probing_scheme_type = ProbingScheme; ///< Probe scheme type
+ template
+ using ref_type =
+ cuco::experimental::static_set_ref; ///< Non-owning container ref type
+
+ static_set(static_set const&) = delete;
+ static_set& operator=(static_set const&) = delete;
+
+ static_set(static_set&&) = default; ///< Move constructor
+
+ /**
+ * @brief Replaces the contents of the container with another container.
+ *
+ * @return Reference of the current set object
+ */
+ static_set& operator=(static_set&&) = default;
+ ~static_set() = default;
+
+ /**
+ * @brief Constructs a statically-sized set with the specified initial capacity, sentinel values
+ * and CUDA stream.
+ *
+ * The actual set capacity depends on the given `capacity`, the probing scheme, CG size, and the
+ * window size and it's computed via `make_valid_extent` factory. Insert operations will not
+ * automatically grow the set. Attempting to insert more unique keys than the capacity of the set
+ * results in undefined behavior.
+ *
+ * The `empty_key_sentinel` is reserved and behavior is undefined when attempting to insert
+ * this sentinel value.
+ * + * @param capacity The requested lower-bound set size + * @param empty_key_sentinel The reserved key value for empty slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the map + */ + constexpr static_set(Extent capacity, + empty_key empty_key_sentinel, + KeyEqual pred = {}, + ProbingScheme const& probing_scheme = {}, + Allocator const& alloc = {}, + cudaStream_t stream = nullptr); + + /** + * @brief Inserts all keys in the range `[first, last)` and returns the number of successful + * insertions. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_async`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_set::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + * + * @return Number of successfully inserted keys + */ + template + size_type insert(InputIt first, InputIt last, cudaStream_t stream = nullptr); + + /** + * @brief Asynchonously inserts all keys in the range `[first, last)`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_set::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_async(InputIt first, InputIt last, cudaStream_t stream = nullptr); + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the set. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_async`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains(InputIt first, + InputIt last, + OutputIt output_begin, + cudaStream_t stream = nullptr) const; + + /** + * @brief Asynchonously indicates whether the keys in the range `[first, last)` are contained in + * the set. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cudaStream_t stream = nullptr) const; + + /** + * @brief Gets the number of elements in the container. + * + * @note This function synchronizes the given stream. + * + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ + [[nodiscard]] size_type size(cudaStream_t stream = nullptr) const; + + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + [[nodiscard]] constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Get device ref with operators. 
+ * + * @tparam Operators Set of `cuco::op` to be provided by the ref + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return Device ref of the current `static_set` object + */ + template + [[nodiscard]] auto ref(Operators... ops) const noexcept; + + private: + key_type empty_key_sentinel_; ///< Key value that represents an empty slot + key_equal predicate_; ///< Key equality binary predicate + probing_scheme_type probing_scheme_; ///< Probing scheme + allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage + storage_type storage_; ///< Slot window storage +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh new file mode 100644 index 000000000..7c5ae13dc --- /dev/null +++ b/include/cuco/static_set_ref.cuh @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary + * operations defined in `include/cuco/operator.hpp` + * + * @note Concurrent modify and lookup will be supported if both kinds of operators are specified + * during the ref construction. 
+ * @note cuCollections data stuctures always place the slot keys on the left-hand + * side when invoking the key comparison predicate. + * @note Ref types are trivially-copyable and are intended to be passed by value. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type + * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` + */ +template +class static_set_ref + : public detail::operator_impl< + Operators, + static_set_ref>... 
{ + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + static_assert( + std::is_base_of_v, + ProbingScheme>, + "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); + + public: + using key_type = Key; ///< Key Type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using storage_ref_type = StorageRef; ///< Type of storage ref + using window_type = typename storage_ref_type::window_type; ///< Window type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using key_equal = KeyEqual; ///< Type of key equality binary callable + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto window_size = + storage_ref_type::window_size; ///< Number of elements handled per window + static constexpr auto thread_scope = Scope; ///< Thread scope + + /** + * @brief Constructs static_set_ref. + * + * @param empty_key_sentinel Sentinel indicating empty key + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr static_set_ref( + cuco::empty_key empty_key_sentinel, + key_equal const& predicate, + probing_scheme_type const& probing_scheme, + storage_ref_type storage_ref) noexcept; + + /** + * @brief Gets the maximum number of elements the container can hold. 
+ * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; + + private: + cuco::empty_key empty_key_sentinel_; ///< Empty key sentinel + detail::equal_wrapper predicate_; ///< Key equality binary callable + probing_scheme_type probing_scheme_; ///< Probing scheme + storage_ref_type storage_ref_; ///< Slot storage ref + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/storage.cuh b/include/cuco/storage.cuh new file mode 100644 index 000000000..969b49f37 --- /dev/null +++ b/include/cuco/storage.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +/** + * @brief Public Array of slot Windows storage class. + * + * The window size defines the workload granularity for each CUDA thread, i.e., how many slots a + * thread would concurrently operate on when performing modify or lookup operations. 
cuCollections + * uses the AoW storage to supersede the raw flat slot storage due to its superior granularity + * control: When window size equals one, AoW performs the same as the flat storage. If the + * underlying operation is more memory bandwidth bound, e.g., high occupancy multimap operations, a + * larger window size can reduce the length of probing sequences thus improve runtime performance. + * + * @tparam WindowSize Number of elements per window storage + */ +template +class aow_storage { + public: + /// Number of elements per window storage + static constexpr int32_t window_size = WindowSize; + + /// Type of implementation details + template + using impl = detail::aow_storage; +}; + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/utility/traits.hpp b/include/cuco/utility/traits.hpp index 445a40daf..78e8dabcb 100644 --- a/include/cuco/utility/traits.hpp +++ b/include/cuco/utility/traits.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -58,4 +58,10 @@ inline constexpr bool is_bitwise_comparable_v = is_bitwise_comparable::value; }; \ } +template +inline constexpr bool dependent_bool_value = value; + +template +inline constexpr bool dependent_false = dependent_bool_value; + } // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 52c4cd9db..16f6abacf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -40,13 +40,28 @@ function(ConfigureTest TEST_NAME) RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") target_compile_options(${TEST_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) - catch_discover_tests(${TEST_NAME}) + catch_discover_tests(${TEST_NAME} EXTRA_ARGS --allow-running-no-tests) endfunction(ConfigureTest) ################################################################################################### ### test sources ################################################################################## ################################################################################################### +################################################################################################### +# - utility tests --------------------------------------------------------------------------------- +ConfigureTest(UTILITY_TEST + utility/extent_test.cu + utility/storage_test.cu) + +################################################################################################### +# - static_set tests ------------------------------------------------------------------------------ +ConfigureTest(STATIC_SET_TEST + static_set/capacity_test.cu + static_set/heterogeneous_lookup_test.cu + static_set/large_input_test.cu + static_set/size_test.cu + static_set/unique_sequence_test.cu) + ################################################################################################### # - static_map tests 
------------------------------------------------------------------------------ ConfigureTest(STATIC_MAP_TEST diff --git a/tests/static_set/capacity_test.cu b/tests/static_set/capacity_test.cu new file mode 100644 index 000000000..e4d3e146a --- /dev/null +++ b/tests/static_set/capacity_test.cu @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +TEST_CASE("Static set capacity", "") +{ + constexpr std::size_t num_keys{400}; + using Key = int32_t; + using ProbeT = + cuco::experimental::double_hashing<1, cuco::murmurhash3_32, cuco::murmurhash3_32>; + using Equal = thrust::equal_to; + using AllocatorT = cuco::cuda_allocator; + using StorageT = cuco::experimental::aow_storage<2>; + + SECTION("Static extent must be evaluated at compile time.") + { + auto constexpr gold_capacity = 422; // 211 x 2 + + using extent_type = cuco::experimental::extent; + cuco::experimental:: + static_set + set{extent_type{}, + cuco::empty_key{-1}, + {}, + ProbeT{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}, + {}}; + auto const capacity = set.capacity(); + STATIC_REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + STATIC_REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto constexpr gold_capacity = 422; // 211 x 2 + + using extent_type = 
cuco::experimental::extent; + cuco::experimental:: + static_set + set{num_keys, + cuco::empty_key{-1}, + {}, + ProbeT{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}, + {}}; + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("Static extent must be evaluated at compile time.") + { + auto constexpr gold_capacity = 412; // 103 x 2 x 2 + + using extent_type = cuco::experimental::extent; + using probe = cuco::experimental::linear_probing<2, cuco::murmurhash3_32>; + auto set = cuco::experimental:: + static_set{ + extent_type{}, cuco::empty_key{-1}, {}, probe{cuco::murmurhash3_32{}}, {}}; + + REQUIRE(set.capacity() == gold_capacity); + + auto const capacity = set.capacity(); + STATIC_REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + STATIC_REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto constexpr gold_capacity = 412; // 103 x 2 x 2 + + using probe = cuco::experimental::linear_probing<2, cuco::murmurhash3_32>; + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + Equal, + probe, + AllocatorT, + StorageT>{ + num_keys, cuco::empty_key{-1}, {}, probe{cuco::murmurhash3_32{}}, {}}; + + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } +} diff --git a/tests/static_set/heterogeneous_lookup_test.cu b/tests/static_set/heterogeneous_lookup_test.cu new file mode 100644 index 000000000..cbc0efac3 --- /dev/null +++ b/tests/static_set/heterogeneous_lookup_test.cu @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +// insert key type +template +struct key_pair { + T a; + T b; + + __host__ __device__ key_pair() {} + __host__ __device__ key_pair(T x) : a{x}, b{x} {} + + // Device equality operator is mandatory due to libcudacxx bug: + // https://github.com/NVIDIA/libcudacxx/issues/223 + __device__ bool operator==(key_pair const& other) const { return a == other.a and b == other.b; } +}; + +// probe key type +template +struct key_triplet { + T a; + T b; + T c; + + __host__ __device__ key_triplet() {} + __host__ __device__ key_triplet(T x) : a{x}, b{x}, c{x} {} + + // Device equality operator is mandatory due to libcudacxx bug: + // https://github.com/NVIDIA/libcudacxx/issues/223 + __device__ bool operator==(key_triplet const& other) const + { + return a == other.a and b == other.b and c == other.c; + } +}; + +// User-defined device hasher +struct custom_hasher { + template + __device__ uint32_t operator()(CustomKey const& k) const + { + return thrust::raw_reference_cast(k).a; + }; +}; + +// User-defined device key equality +struct custom_key_equal { + template + __device__ bool operator()(LHS const& lhs, RHS const& rhs) const + { + return thrust::raw_reference_cast(lhs).a == thrust::raw_reference_cast(rhs).a; + } +}; + +TEMPLATE_TEST_CASE_SIG( + "Heterogeneous lookup", "", ((typename T, int 
CGSize), T, CGSize), (int32_t, 1), (int32_t, 2)) +{ + using Key = key_pair; + using ProbeKey = key_triplet; + using probe_type = cuco::experimental::double_hashing; + + auto const sentinel_key = Key{-1}; + + constexpr std::size_t num = 100; + constexpr std::size_t capacity = num * 2; + auto const probe = probe_type{custom_hasher{}, custom_hasher{}}; + auto my_set = cuco::experimental::static_set, + cuda::thread_scope_device, + custom_key_equal, + probe_type>{ + capacity, cuco::empty_key{sentinel_key}, custom_key_equal{}, probe}; + + auto insert_pairs = thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return Key{i}; }); + auto probe_keys = thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return ProbeKey(i); }); + + SECTION("All inserted keys should be contained") + { + thrust::device_vector contained(num); + my_set.insert(insert_pairs, insert_pairs + num); + my_set.contains(probe_keys, probe_keys + num, contained.begin()); + REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), thrust::identity{})); + } + + SECTION("Non-inserted keys should not be contained") + { + thrust::device_vector contained(num); + my_set.contains(probe_keys, probe_keys + num, contained.begin()); + REQUIRE(cuco::test::none_of(contained.begin(), contained.end(), thrust::identity{})); + } +} diff --git a/tests/static_set/large_input_test.cu b/tests/static_set/large_input_test.cu new file mode 100644 index 000000000..6f4e5803b --- /dev/null +++ b/tests/static_set/large_input_test.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +template +__inline__ void test_unique_sequence(Set& set, bool* res_begin, std::size_t num_keys) +{ + using Key = typename Set::key_type; + + auto const keys_begin = thrust::counting_iterator(0); + auto const keys_end = thrust::counting_iterator(num_keys); + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + set.contains(keys_begin, keys_end, res_begin); + REQUIRE(cuco::test::none_of(res_begin, res_begin + num_keys, thrust::identity{})); + } + + set.insert(keys_begin, keys_end); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted key/value pairs should be contained.") + { + set.contains(keys_begin, keys_end, res_begin); + REQUIRE(cuco::test::all_of(res_begin, res_begin + num_keys, thrust::identity{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Large input", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2)) +{ + constexpr std::size_t num_keys{1'200'000'000}; + + using extent_type = cuco::experimental::extent; + using probe = cuco::experimental:: + double_hashing, cuco::murmurhash3_32>; + + try { + auto set = cuco::experimental:: + static_set, probe>{ + num_keys * 2, + cuco::empty_key{-1}, + 
thrust::equal_to{}, + probe{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}}; + + thrust::device_vector d_contained(num_keys); + test_unique_sequence(set, d_contained.data().get(), num_keys); + } catch (cuco::cuda_error&) { + SKIP("Out of memory"); + } catch (std::bad_alloc&) { + SKIP("Out of memory"); + } +} diff --git a/tests/static_set/size_test.cu b/tests/static_set/size_test.cu new file mode 100644 index 000000000..05b1f4f8b --- /dev/null +++ b/tests/static_set/size_test.cu @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +TEST_CASE("Size computation", "") +{ + constexpr std::size_t num_keys{400}; + + cuco::experimental::static_set set{cuco::experimental::extent{400}, + cuco::empty_key{-1}}; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto const num_successes = set.insert(d_keys.begin(), d_keys.end()); + + auto const size = set.size(); + + REQUIRE(size == num_keys); + REQUIRE(num_successes == num_keys); +} diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu new file mode 100644 index 000000000..c2eab7eff --- /dev/null +++ b/tests/static_set/unique_sequence_test.cu @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +template +__inline__ void test_unique_sequence(Set& set, std::size_t num_keys) +{ + using Key = typename Set::key_type; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto key_begin = d_keys.begin(); + thrust::device_vector d_contained(num_keys); + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + set.contains(key_begin, key_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } + + set.insert(key_begin, key_begin + num_keys); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted key/value pairs should be contained.") + { + set.contains(key_begin, key_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Unique sequence", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, 
cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr std::size_t num_keys{400}; + auto constexpr gold_capacity = CGSize == 1 ? 422 // 211 x 1 x 2 + : 412 // 103 x 2 x 2 + ; + + using extent_type = cuco::experimental::extent; + + if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { + using probe = cuco::experimental::linear_probing>; + auto set = cuco::experimental:: + static_set, probe>{ + num_keys, + cuco::empty_key{-1}, + thrust::equal_to{}, + probe{cuco::murmurhash3_32{}}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); + } + + if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { + using probe = cuco::experimental:: + double_hashing, cuco::murmurhash3_32>; + auto set = cuco::experimental:: + static_set, probe>{ + num_keys, + cuco::empty_key{-1}, + thrust::equal_to{}, + probe{cuco::murmurhash3_32{}, cuco::murmurhash3_32{}}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); + } +} diff --git a/tests/utility/extent_test.cu b/tests/utility/extent_test.cu new file mode 100644 index 000000000..2623a8ae5 --- /dev/null +++ b/tests/utility/extent_test.cu @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include + +TEMPLATE_TEST_CASE_SIG( + "Extent tests", "", ((typename SizeType), SizeType), (int32_t), (int64_t), (std::size_t)) +{ + SizeType constexpr num = 1234; + SizeType constexpr gold_reference = 314; // 157 x 2 + auto constexpr cg_size = 2; + auto constexpr window_size = 4; + + SECTION("Static extent must be evaluated at compile time.") + { + auto const size = cuco::experimental::extent{}; + STATIC_REQUIRE(num == size); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto const size = cuco::experimental::extent(num); + REQUIRE(size == num); + } + + SECTION("Compute static valid extent at compile time.") + { + auto constexpr size = cuco::experimental::extent{}; + auto constexpr res = cuco::experimental::make_valid_extent(size); + STATIC_REQUIRE(gold_reference == res); + } + + SECTION("Compute dynamic valid extent at run time.") + { + auto const size = cuco::experimental::extent{num}; + auto const res = cuco::experimental::make_valid_extent(size); + REQUIRE(gold_reference == res); + } +} diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu new file mode 100644 index 000000000..c82b5ab44 --- /dev/null +++ b/tests/utility/storage_test.cu @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include + +TEMPLATE_TEST_CASE_SIG("Storage tests", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t), + (int32_t, int64_t), + (int64_t, int64_t)) +{ + constexpr std::size_t size{1'000}; + constexpr int window_size{2}; + constexpr std::size_t gold_capacity{2'000}; + + using allocator_type = cuco::cuda_allocator; + auto allocator = allocator_type{}; + + SECTION("Allocate array of pairs with AoS storage.") + { + auto s = cuco::experimental::detail::aow_storage, + cuco::experimental::extent, + allocator_type>( + cuco::experimental::extent{size}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + REQUIRE(num_windows == size); + REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of pairs with AoS storage with static extent.") + { + using extent_type = cuco::experimental::extent; + auto s = cuco::experimental::detail:: + aow_storage, extent_type, allocator_type>(extent_type{}, + allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + STATIC_REQUIRE(num_windows == size); + STATIC_REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of keys with AoS storage.") + { + auto s = cuco::experimental::detail:: + aow_storage, allocator_type>( + cuco::experimental::extent{size}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + REQUIRE(num_windows == size); + REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of keys with AoS storage with static extent.") + { + using extent_type = cuco::experimental::extent; + auto s = cuco::experimental::detail::aow_storage( + extent_type{}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + STATIC_REQUIRE(num_windows == size); + STATIC_REQUIRE(capacity == gold_capacity); + } +}