From 460fc8f06e9f3f31c74a79d2a213586e5c657f72 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 21 Dec 2023 14:28:32 -0800 Subject: [PATCH 1/3] Use libcudacxx stream ref --- include/cuco/cuda_stream_ref.hpp | 120 +----------------- .../open_addressing/open_addressing_impl.cuh | 32 ++--- include/cuco/detail/static_map/static_map.inl | 14 +- include/cuco/detail/static_set/static_set.inl | 10 +- include/cuco/detail/storage/aow_storage.inl | 4 +- .../cuco/detail/storage/counter_storage.cuh | 8 +- .../trie/dynamic_bitset/dynamic_bitset.inl | 49 ++++--- 7 files changed, 66 insertions(+), 171 deletions(-) diff --git a/include/cuco/cuda_stream_ref.hpp b/include/cuco/cuda_stream_ref.hpp index bf0a5dea9..ef41facf7 100644 --- a/include/cuco/cuda_stream_ref.hpp +++ b/include/cuco/cuda_stream_ref.hpp @@ -15,128 +15,14 @@ */ #pragma once -#include +#define LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE -#include +#include namespace cuco { namespace experimental { -/** - * @brief Strongly-typed non-owning wrapper for CUDA streams with default constructor. - * - * This wrapper is simply a "view": it does not own the lifetime of the stream it wraps. - */ -class cuda_stream_ref { - public: - constexpr cuda_stream_ref() = default; ///< Default constructor - constexpr cuda_stream_ref(cuda_stream_ref const&) = default; ///< Copy constructor - constexpr cuda_stream_ref(cuda_stream_ref&&) = default; ///< Move constructor - - /** - * @brief Copy-assignment operator. - * - * @return Copy of this stream reference. - */ - constexpr cuda_stream_ref& operator=(cuda_stream_ref const&) = default; - - /** - * @brief Move-assignment operator. - * - * @return New location of this stream reference. - */ - constexpr cuda_stream_ref& operator=(cuda_stream_ref&&) = default; ///< Move-assignment operator - - ~cuda_stream_ref() = default; - - constexpr cuda_stream_ref(int) = delete; //< Prevent cast from literal 0 - constexpr cuda_stream_ref(std::nullptr_t) = delete; //< Prevent cast from nullptr - - /** - * @brief Implicit conversion from `cudaStream_t`. - * - * @param stream The CUDA stream to reference. - */ - constexpr cuda_stream_ref(cudaStream_t stream) noexcept : stream_{stream} {} - - /** - * @brief Get the wrapped stream. - * - * @return The wrapped stream. - */ - [[nodiscard]] constexpr cudaStream_t value() const noexcept { return stream_; } - - /** - * @brief Implicit conversion to `cudaStream_t`. - * - * @return The underlying `cudaStream_t`. - */ - constexpr operator cudaStream_t() const noexcept { return value(); } - - /** - * @brief Return true if the wrapped stream is the CUDA per-thread default stream. - * - * @return True if the wrapped stream is the per-thread default stream; else false. - */ - [[nodiscard]] inline bool is_per_thread_default() const noexcept; - - /** - * @brief Return true if the wrapped stream is explicitly the CUDA legacy default stream. - * - * @return True if the wrapped stream is the default stream; else false. - */ - [[nodiscard]] inline bool is_default() const noexcept; - - /** - * @brief Synchronize the viewed CUDA stream. - * - * Calls `cudaStreamSynchronize()`. - * - * @throw cuco::cuda_error if stream synchronization fails - */ - void synchronize() const; - - private: - cudaStream_t stream_{}; -}; - -/** - * @brief Static `cuda_stream_ref` of the default stream (stream 0), for convenience - */ -static constexpr cuda_stream_ref cuda_stream_default{}; - -/** - * @brief Static `cuda_stream_ref` of cudaStreamLegacy, for convenience - */ -static const cuda_stream_ref cuda_stream_legacy{cudaStreamLegacy}; - -/** - * @brief Static `cuda_stream_ref` of cudaStreamPerThread, for convenience - */ -static const cuda_stream_ref cuda_stream_per_thread{cudaStreamPerThread}; - -// /** -// * @brief Equality comparison operator for streams -// * -// * @param lhs The first stream view to compare -// * @param rhs The second stream view to compare -// * @return true if equal, false if unequal -// */ -// inline bool operator==(cuda_stream_ref lhs, cuda_stream_ref rhs) -// { -// return lhs.value() == rhs.value(); -// } - -// /** -// * @brief Inequality comparison operator for streams -// * -// * @param lhs The first stream view to compare -// * @param rhs The second stream view to compare -// * @return true if unequal, false if equal -// */ -// inline bool operator!=(cuda_stream_ref lhs, cuda_stream_ref rhs) { return not(lhs == rhs); } +using cuda_stream_ref = cuda::stream_ref; } // namespace experimental } // namespace cuco - -#include \ No newline at end of file diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 3adc24d99..f39c7ce88 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -288,7 +288,7 @@ class open_addressing_impl { auto const always_true = thrust::constant_iterator{true}; detail::insert_if_n - <<>>( + <<>>( first, num_keys, always_true, thrust::identity{}, counter.data(), container_ref); return counter.load_to_host(stream); @@ -317,7 +317,7 @@ class open_addressing_impl { auto const always_true = thrust::constant_iterator{true}; detail::insert_if_n - <<>>( + <<>>( first, num_keys, always_true, thrust::identity{}, container_ref); } @@ -365,7 +365,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); detail::insert_if_n - <<>>( + <<>>( first, num_keys, stencil, pred, counter.data(), container_ref); return counter.load_to_host(stream); @@ -407,7 +407,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); detail::insert_if_n - <<>>( + <<>>( first, num_keys, stencil, pred, container_ref); } @@ -447,7 +447,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); detail::erase - <<>>( + <<>>( first, num_keys, container_ref); } @@ -479,7 +479,7 @@ class open_addressing_impl { auto const always_true = thrust::constant_iterator{true}; detail::contains_if_n - <<>>( + <<>>( first, num_keys, always_true, thrust::identity{}, output_begin, container_ref); } @@ -527,7 +527,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); detail::contains_if_n - <<>>( + <<>>( first, num_keys, stencil, pred, output_begin, container_ref); } @@ -571,7 +571,7 @@ class open_addressing_impl { d_num_out, this->capacity(), is_filled, - stream)); + stream.get())); // Allocate temporary storage auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); @@ -583,12 +583,12 @@ class open_addressing_impl { d_num_out, this->capacity(), is_filled, - stream)); + stream.get())); size_type h_num_out; - CUCO_CUDA_TRY( - cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); - stream.synchronize(); + CUCO_CUDA_TRY(cudaMemcpyAsync( + &h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get())); + stream.wait(); std::allocator_traits::deallocate( temp_allocator, reinterpret_cast(d_num_out), sizeof(size_type)); temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); @@ -620,7 +620,7 @@ class open_addressing_impl { // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to // v2.1.0 detail::size - <<>>( + <<>>( storage_.ref(), is_filled, counter.data()); return counter.load_to_host(stream); @@ -644,7 +644,7 @@ class open_addressing_impl { void rehash(Container const& container, Predicate const& is_filled, cuda_stream_ref stream) { this->rehash_async(container, is_filled, stream); - stream.synchronize(); + stream.wait(); } /** @@ -678,7 +678,7 @@ class open_addressing_impl { cuda_stream_ref stream) { this->rehash_async(extent, container, is_filled, stream); - stream.synchronize(); + stream.wait(); } /** @@ -736,7 +736,7 @@ class open_addressing_impl { auto constexpr stride = cuco::detail::default_stride(); auto const grid_size = cuco::detail::grid_size(num_windows, 1, stride, block_size); - detail::rehash<<>>( + detail::rehash<<>>( old_storage.ref(), container.ref(op::insert), is_filled); } diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 44310927e..961c52cee 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -223,7 +223,7 @@ void static_mapinsert_or_assign_async(first, last, stream); - stream.synchronize(); + stream.wait(); } template - <<>>( + <<>>( first, num, ref(op::insert_or_assign)); } @@ -261,7 +261,7 @@ void static_map - <<>>( + <<>>( first, num_keys, output_begin, ref(op::find)); } diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index 600b4c4c9..e9527be02 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -200,7 +200,7 @@ void static_set InputIt first, InputIt last, cuda_stream_ref stream) { erase_async(first, last, stream); - stream.synchronize(); + stream.wait(); } template InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const { contains_async(first, last, output_begin, stream); - stream.synchronize(); + stream.wait(); } template cuda_stream_ref stream) const { contains_if_async(first, last, stencil, pred, output_begin, stream); - stream.synchronize(); + stream.wait(); } template InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const { find_async(first, last, output_begin, stream); - stream.synchronize(); + stream.wait(); } template auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); static_set_ns::detail::find - <<>>( + <<>>( first, num_keys, output_begin, ref(op::find)); } diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl index c4b5fa8b6..c73f4d95c 100644 --- a/include/cuco/detail/storage/aow_storage.inl +++ b/include/cuco/detail/storage/aow_storage.inl @@ -68,7 +68,7 @@ void aow_storage::initialize(value_type key, cuda_stream_ref stream) noexcept { this->initialize_async(key, stream); - stream.synchronize(); + stream.wait(); } template @@ -79,7 +79,7 @@ void aow_storage::initialize_async( auto constexpr stride = 4; auto const grid_size = cuco::detail::grid_size(this->num_windows(), cg_size, stride); - detail::initialize<<>>( + detail::initialize<<>>( this->data(), this->num_windows(), key); } diff --git a/include/cuco/detail/storage/counter_storage.cuh b/include/cuco/detail/storage/counter_storage.cuh index 92751443d..3e71994bb 100644 --- a/include/cuco/detail/storage/counter_storage.cuh +++ b/include/cuco/detail/storage/counter_storage.cuh @@ -69,7 +69,7 @@ class counter_storage : public storage_basedata(), 0, sizeof(value_type), stream)); + CUCO_CUDA_TRY(cudaMemsetAsync(this->data(), 0, sizeof(value_type), stream.get())); } /** @@ -97,9 +97,9 @@ class counter_storage : public storage_basedata(), sizeof(size_type), cudaMemcpyDeviceToHost, stream)); - stream.synchronize(); + CUCO_CUDA_TRY(cudaMemcpyAsync( + &h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream.get())); + stream.wait(); return h_count; } diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index d56ef9d7c..3bbea22ef 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -87,7 +87,7 @@ constexpr void dynamic_bitset::test(KeyIt keys_begin, auto const grid_size = cuco::detail::grid_size(num_keys); - bitset_test_kernel<<>>( + bitset_test_kernel<<>>( ref(), keys_begin, outputs_begin, num_keys); } @@ -104,7 +104,7 @@ constexpr void dynamic_bitset::rank(KeyIt keys_begin, auto const grid_size = cuco::detail::grid_size(num_keys); - bitset_rank_kernel<<>>( + bitset_rank_kernel<<>>( ref(), keys_begin, outputs_begin, num_keys); } @@ -122,7 +122,7 @@ constexpr void dynamic_bitset::select(KeyIt keys_begin, auto const grid_size = cuco::detail::grid_size(num_keys); - bitset_select_kernel<<>>( + bitset_select_kernel<<>>( ref(), keys_begin, outputs_begin, num_keys); } @@ -145,15 +145,19 @@ constexpr void dynamic_bitset::build_ranks_and_selects( auto const bit_counts_begin = thrust::raw_pointer_cast(bit_counts.data()); auto grid_size = cuco::detail::grid_size(num_words); - bit_counts_kernel<<>>( + bit_counts_kernel<<>>( thrust::raw_pointer_cast(words_.data()), bit_counts_begin, num_words, flip_bits); std::size_t temp_storage_bytes = 0; using temp_allocator_type = typename std::allocator_traits::rebind_alloc; auto temp_allocator = temp_allocator_type{this->allocator_}; - CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum( - nullptr, temp_storage_bytes, bit_counts_begin, bit_counts_begin, bit_counts_size, stream)); + CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum(nullptr, + temp_storage_bytes, + bit_counts_begin, + bit_counts_begin, + bit_counts_size, + stream.get())); // Allocate temporary storage auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); @@ -163,7 +167,7 @@ constexpr void dynamic_bitset::build_ranks_and_selects( bit_counts_begin, bit_counts_begin, bit_counts_size, - stream)); + stream.get())); temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); @@ -172,25 +176,30 @@ constexpr void dynamic_bitset::build_ranks_and_selects( ranks.resize(num_blocks); grid_size = cuco::detail::grid_size(num_blocks); - encode_ranks_from_prefix_bit_counts<<>>( - bit_counts_begin, - thrust::raw_pointer_cast(ranks.data()), - num_words, - num_blocks, - words_per_block); + encode_ranks_from_prefix_bit_counts<<>>(bit_counts_begin, + thrust::raw_pointer_cast(ranks.data()), + num_words, + num_blocks, + words_per_block); // Step 3. Compute selects thrust::device_vector select_markers(num_blocks, this->allocator_); auto const select_markers_begin = thrust::raw_pointer_cast(select_markers.data()); - mark_blocks_with_select_entries<<>>( + mark_blocks_with_select_entries<<>>( bit_counts_begin, select_markers_begin, num_blocks, words_per_block, bits_per_block); auto d_sum = reinterpret_cast(thrust::raw_pointer_cast( std::allocator_traits::allocate(temp_allocator, sizeof(size_type)))); CUCO_CUDA_TRY(cub::DeviceReduce::Sum( - nullptr, temp_storage_bytes, select_markers_begin, d_sum, num_blocks, stream)); + nullptr, temp_storage_bytes, select_markers_begin, d_sum, num_blocks, stream.get())); d_temp_storage = temp_allocator.allocate(temp_storage_bytes); @@ -199,12 +208,12 @@ constexpr void dynamic_bitset::build_ranks_and_selects( select_markers_begin, d_sum, num_blocks, - stream)); + stream.get())); size_type num_selects{}; CUCO_CUDA_TRY( - cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); - stream.synchronize(); + cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get())); + stream.wait(); std::allocator_traits::deallocate( temp_allocator, thrust::device_ptr{reinterpret_cast(d_sum)}, sizeof(size_type)); temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); @@ -220,7 +229,7 @@ constexpr void dynamic_bitset::build_ranks_and_selects( select_begin, thrust::make_discard_iterator(), num_blocks, - stream)); + stream.get())); d_temp_storage = temp_allocator.allocate(temp_storage_bytes); @@ -231,7 +240,7 @@ constexpr void dynamic_bitset::build_ranks_and_selects( select_begin, thrust::make_discard_iterator(), num_blocks, - stream)); + stream.get())); temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); } From ebe3e17cafcc0bb4f5bfb557282e0e22dea5daaf Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 21 Dec 2023 14:29:03 -0800 Subject: [PATCH 2/3] Remove unused file --- include/cuco/detail/cuda_stream_ref.inl | 50 ------------------------- 1 file changed, 50 deletions(-) delete mode 100644 include/cuco/detail/cuda_stream_ref.inl diff --git a/include/cuco/detail/cuda_stream_ref.inl b/include/cuco/detail/cuda_stream_ref.inl deleted file mode 100644 index 64aa078aa..000000000 --- a/include/cuco/detail/cuda_stream_ref.inl +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include - -namespace cuco { -namespace experimental { - -[[nodiscard]] inline bool cuda_stream_ref::is_per_thread_default() const noexcept -{ -#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM - return value() == cuda_stream_per_thread || value() == nullptr; -#else - return value() == cuda_stream_per_thread; -#endif -} - -[[nodiscard]] inline bool cuda_stream_ref::is_default() const noexcept -{ -#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM - return value() == cuda_stream_legacy; -#else - return value() == cuda_stream_legacy || value() == nullptr; -#endif -} - -inline void cuda_stream_ref::synchronize() const -{ - CUCO_CUDA_TRY(cudaStreamSynchronize(this->stream_)); -} - -} // namespace experimental -} // namespace cuco \ No newline at end of file From df678d569223534c39035c808c56f57507c52e7e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 21 Dec 2023 15:45:58 -0800 Subject: [PATCH 3/3] Rename cuda_stream_ref as stream_ref --- include/cuco/aow_storage.cuh | 6 +- .../open_addressing/open_addressing_impl.cuh | 36 ++++++------ include/cuco/detail/static_map/static_map.inl | 52 ++++++++--------- include/cuco/detail/static_set/static_set.inl | 46 +++++++-------- include/cuco/detail/storage/aow_storage.inl | 8 +-- .../cuco/detail/storage/counter_storage.cuh | 6 +- .../trie/dynamic_bitset/dynamic_bitset.cuh | 12 ++-- .../trie/dynamic_bitset/dynamic_bitset.inl | 10 ++-- include/cuco/static_map.cuh | 58 +++++++++---------- include/cuco/static_set.cuh | 54 ++++++++--------- .../{cuda_stream_ref.hpp => stream_ref.hpp} | 2 +- 11 files changed, 139 insertions(+), 151 deletions(-) rename include/cuco/{cuda_stream_ref.hpp => stream_ref.hpp} (95%) diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh index 2ac70095c..f92b65fe4 100644 --- a/include/cuco/aow_storage.cuh +++ b/include/cuco/aow_storage.cuh @@ -16,9 +16,9 @@ #pragma once -#include #include #include +#include #include #include @@ -123,7 +123,7 @@ class aow_storage : public detail::aow_storage_base { * @param key Key to which all keys in `slots` are initialized * @param stream Stream used for executing the kernel */ - void initialize(value_type key, cuda_stream_ref stream = {}) noexcept; + void initialize(value_type key, stream_ref stream = {}) noexcept; /** * @brief Asynchronously initializes each slot in the AoW storage to contain `key`. @@ -131,7 +131,7 @@ class aow_storage : public detail::aow_storage_base { * @param key Key to which all keys in `slots` are initialized * @param stream Stream used for executing the kernel */ - void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept; + void initialize_async(value_type key, stream_ref stream = {}) noexcept; private: allocator_type allocator_; ///< Allocator used to (de)allocate windows diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index f39c7ce88..f117c67a8 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -128,7 +128,7 @@ class open_addressing_impl { KeyEqual const& pred, ProbingScheme const& probing_scheme, Allocator const& alloc, - cuda_stream_ref stream) noexcept + stream_ref stream) noexcept : empty_key_sentinel_{empty_key_sentinel}, empty_slot_sentinel_{empty_slot_sentinel}, erased_key_sentinel_{empty_key_sentinel}, @@ -177,7 +177,7 @@ class open_addressing_impl { KeyEqual const& pred, ProbingScheme const& probing_scheme, Allocator const& alloc, - cuda_stream_ref stream) + stream_ref stream) : empty_key_sentinel_{empty_key_sentinel}, empty_slot_sentinel_{empty_slot_sentinel}, predicate_{pred}, @@ -221,7 +221,7 @@ class open_addressing_impl { KeyEqual const& pred, ProbingScheme const& probing_scheme, Allocator const& alloc, - cuda_stream_ref stream) + stream_ref stream) : empty_key_sentinel_{empty_key_sentinel}, empty_slot_sentinel_{empty_slot_sentinel}, erased_key_sentinel_{erased_key_sentinel}, @@ -242,7 +242,7 @@ class open_addressing_impl { * * @param stream CUDA stream this operation is executed in */ - void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); } + void clear(stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); } /** * @brief Asynchronously erases all elements from the container. After this call, `size()` returns @@ -250,7 +250,7 @@ class open_addressing_impl { * * @param stream CUDA stream this operation is executed in */ - void clear_async(cuda_stream_ref stream) noexcept + void clear_async(stream_ref stream) noexcept { storage_.initialize_async(empty_slot_sentinel_, stream); } @@ -275,7 +275,7 @@ class open_addressing_impl { * @return Number of successfully inserted keys */ template - size_type insert(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream) + size_type insert(InputIt first, InputIt last, Ref container_ref, stream_ref stream) { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return 0; } @@ -308,7 +308,7 @@ class open_addressing_impl { * @param stream CUDA stream used for insert */ template - void insert_async(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream) noexcept + void insert_async(InputIt first, InputIt last, Ref container_ref, stream_ref stream) noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -353,7 +353,7 @@ class open_addressing_impl { StencilIt stencil, Predicate pred, Ref container_ref, - cuda_stream_ref stream) + stream_ref stream) { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return 0; } @@ -399,7 +399,7 @@ class open_addressing_impl { StencilIt stencil, Predicate pred, Ref container_ref, - cuda_stream_ref stream) noexcept + stream_ref stream) noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -435,7 +435,7 @@ class open_addressing_impl { * provided at construction */ template - void erase_async(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream = {}) + void erase_async(InputIt first, InputIt last, Ref container_ref, stream_ref stream = {}) { CUCO_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, "The empty key sentinel and erased key sentinel cannot be the same value.", @@ -470,7 +470,7 @@ class open_addressing_impl { InputIt last, OutputIt output_begin, Ref container_ref, - cuda_stream_ref stream) const noexcept + stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -519,7 +519,7 @@ class open_addressing_impl { Predicate pred, OutputIt output_begin, Ref container_ref, - cuda_stream_ref stream) const noexcept + stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -556,7 +556,7 @@ class open_addressing_impl { [[nodiscard]] OutputIt retrieve_all(InputIt begin, OutputIt output_begin, Predicate const& is_filled, - cuda_stream_ref stream) const + stream_ref stream) const { std::size_t temp_storage_bytes = 0; using temp_allocator_type = @@ -609,7 +609,7 @@ class open_addressing_impl { * @return The number of elements in the container */ template - [[nodiscard]] size_type size(Predicate const& is_filled, cuda_stream_ref stream) const noexcept + [[nodiscard]] size_type size(Predicate const& is_filled, stream_ref stream) const noexcept { auto counter = detail::counter_storage{this->allocator()}; @@ -641,7 +641,7 @@ class open_addressing_impl { * @param stream CUDA stream used for this operation */ template - void rehash(Container const& container, Predicate const& is_filled, cuda_stream_ref stream) + void rehash(Container const& container, Predicate const& is_filled, stream_ref stream) { this->rehash_async(container, is_filled, stream); stream.wait(); @@ -675,7 +675,7 @@ class open_addressing_impl { void rehash(extent_type extent, Container const& container, Predicate const& is_filled, - cuda_stream_ref stream) + stream_ref stream) { this->rehash_async(extent, container, is_filled, stream); stream.wait(); @@ -693,7 +693,7 @@ class open_addressing_impl { * @param stream CUDA stream used for this operation */ template - void rehash_async(Container const& container, Predicate const& is_filled, cuda_stream_ref stream) + void rehash_async(Container const& container, Predicate const& is_filled, stream_ref stream) { this->rehash_async(this->storage_.window_extent(), container, is_filled, stream); } @@ -723,7 +723,7 @@ class open_addressing_impl { void rehash_async(extent_type extent, Container const& container, Predicate const& is_filled, - cuda_stream_ref stream) + stream_ref stream) { auto const old_storage = std::move(this->storage_); new (&storage_) storage_type{extent, this->allocator()}; diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 961c52cee..c5cf0785d 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -22,6 +21,7 @@ #include #include #include +#include #include @@ -45,7 +45,7 @@ constexpr static_map, Storage, Allocator const& alloc, - cuda_stream_ref stream) + stream_ref stream) : impl_{std::make_unique(capacity, empty_key_sentinel, cuco::pair{empty_key_sentinel, empty_value_sentinel}, @@ -75,7 +75,7 @@ constexpr static_map, Storage, Allocator const& alloc, - cuda_stream_ref stream) + stream_ref stream) : impl_{std::make_unique(n, desired_load_factor, empty_key_sentinel, @@ -106,7 +106,7 @@ constexpr static_map, Storage, Allocator const& alloc, - cuda_stream_ref stream) + stream_ref stream) : impl_{std::make_unique(capacity, empty_key_sentinel, cuco::pair{empty_key_sentinel, empty_value_sentinel}, @@ -128,7 +128,7 @@ template void static_map::clear( - cuda_stream_ref stream) noexcept + stream_ref stream) noexcept { impl_->clear(stream); } @@ -142,7 +142,7 @@ template void static_map::clear_async( - cuda_stream_ref stream) noexcept + stream_ref stream) noexcept { impl_->clear_async(stream); } @@ -158,7 +158,7 @@ template static_map::size_type static_map::insert( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, stream_ref stream) { return impl_->insert(first, last, ref(op::insert), stream); } @@ -173,7 +173,7 @@ template template void static_map::insert_async( - InputIt first, InputIt last, cuda_stream_ref stream) noexcept + InputIt first, InputIt last, stream_ref stream) noexcept { impl_->insert_async(first, last, ref(op::insert), stream); } @@ -189,7 +189,7 @@ template static_map::size_type static_map::insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) + InputIt first, InputIt last, StencilIt stencil, Predicate pred, stream_ref stream) { return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); } @@ -205,7 +205,7 @@ template void static_map:: insert_if_async( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept + InputIt first, InputIt last, StencilIt stencil, Predicate pred, stream_ref stream) noexcept { impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); } @@ -220,7 +220,7 @@ template template void static_map:: - insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) noexcept + insert_or_assign(InputIt first, InputIt last, stream_ref stream) noexcept { return this->insert_or_assign_async(first, last, stream); stream.wait(); @@ -236,7 +236,7 @@ template template void static_map:: - insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream) noexcept + insert_or_assign_async(InputIt first, InputIt last, stream_ref stream) noexcept { auto const num = cuco::detail::distance(first, last); if (num == 0) { return; } @@ -258,7 +258,7 @@ template template void static_map::erase( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, stream_ref stream) { erase_async(first, last, stream); stream.wait(); @@ -274,7 +274,7 @@ template template void static_map::erase_async( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, stream_ref stream) { impl_->erase_async(first, last, ref(op::erase), stream); } @@ -289,7 +289,7 @@ template template void static_map::contains( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, stream_ref stream) const { contains_async(first, last, output_begin, stream); stream.wait(); @@ -305,7 +305,7 @@ template template void static_map::contains_async( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const noexcept + InputIt first, InputIt last, OutputIt output_begin, stream_ref stream) const noexcept { impl_->contains_async(first, last, output_begin, ref(op::contains), stream); } @@ -325,7 +325,7 @@ void static_mapcontains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream); } @@ -361,7 +361,7 @@ template template void static_map::find( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, stream_ref stream) const { find_async(first, last, output_begin, stream); stream.wait(); @@ -377,7 +377,7 @@ template template void static_map::find_async( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, stream_ref stream) const { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -400,7 +400,7 @@ template std::pair static_map::retrieve_all( - KeyOut keys_out, ValueOut values_out, cuda_stream_ref stream) const + KeyOut keys_out, ValueOut values_out, stream_ref stream) const { auto const begin = thrust::make_transform_iterator( thrust::counting_iterator{0}, @@ -423,7 +423,7 @@ template void static_map::rehash( - cuda_stream_ref stream) + stream_ref stream) { auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); @@ -439,7 +439,7 @@ template void static_map::rehash( - size_type capacity, cuda_stream_ref stream) + size_type capacity, stream_ref stream) { auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); @@ -456,7 +456,7 @@ template void static_map::rehash_async( - cuda_stream_ref stream) + stream_ref stream) { auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); @@ -472,7 +472,7 @@ template void static_map::rehash_async( - size_type capacity, cuda_stream_ref stream) + size_type capacity, stream_ref stream) { auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); @@ -490,7 +490,7 @@ template static_map::size_type static_map::size( - cuda_stream_ref stream) const noexcept + stream_ref stream) const noexcept { auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index e9527be02..7736f4755 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -42,7 +42,7 @@ constexpr static_set, Storage, Allocator const& alloc, - cuda_stream_ref stream) + stream_ref stream) : impl_{std::make_unique( capacity, empty_key_sentinel, empty_key_sentinel, pred, probing_scheme, alloc, stream)} { @@ -64,7 +64,7 @@ constexpr static_set, Storage, Allocator const& alloc, - cuda_stream_ref stream) + stream_ref stream) : impl_{std::make_unique(n, desired_load_factor, empty_key_sentinel, @@ -92,7 +92,7 @@ constexpr static_set, Storage, Allocator const& alloc, - cuda_stream_ref stream) + stream_ref stream) : impl_{std::make_unique(capacity, empty_key_sentinel, empty_key_sentinel, @@ -112,7 +112,7 @@ template void static_set::clear( - cuda_stream_ref stream) noexcept + stream_ref stream) noexcept { impl_->clear(stream); } @@ -125,7 +125,7 @@ template void static_set::clear_async( - cuda_stream_ref stream) noexcept + stream_ref stream) noexcept { impl_->clear_async(stream); } @@ -140,7 +140,7 @@ template static_set::size_type static_set::insert( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, stream_ref stream) { return impl_->insert(first, last, ref(op::insert), stream); } @@ -154,7 +154,7 @@ template template void static_set::insert_async( - InputIt first, InputIt last, cuda_stream_ref stream) noexcept + InputIt first, InputIt last, stream_ref stream) noexcept { impl_->insert_async(first, last, ref(op::insert), stream); } @@ -169,7 +169,7 @@ template static_set::size_type static_set::insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) + InputIt first, InputIt last, StencilIt stencil, Predicate pred, stream_ref stream) { return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); } @@ -183,7 +183,7 @@ template template void static_set::insert_if_async( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept + InputIt first, InputIt last, StencilIt stencil, Predicate pred, stream_ref stream) noexcept { impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); } @@ -197,7 +197,7 @@ template template void static_set::erase( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, stream_ref stream) { erase_async(first, last, stream); stream.wait(); @@ -212,7 +212,7 @@ template template void static_set::erase_async( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, stream_ref stream) { impl_->erase_async(first, last, ref(op::erase), stream); } @@ -226,7 +226,7 @@ template template void static_set::contains( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, stream_ref stream) const { contains_async(first, last, output_begin, stream); stream.wait(); @@ -241,7 +241,7 @@ template template void static_set::contains_async( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const noexcept + InputIt first, InputIt last, OutputIt output_begin, stream_ref stream) const noexcept { impl_->contains_async(first, last, output_begin, ref(op::contains), stream); } @@ -260,7 +260,7 @@ void static_set StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream) const + stream_ref stream) const { contains_if_async(first, last, stencil, pred, output_begin, stream); stream.wait(); @@ -280,7 +280,7 @@ void static_set StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream) const noexcept + stream_ref stream) const noexcept { impl_->contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream); } @@ -294,7 +294,7 @@ template template void static_set::find( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, stream_ref stream) const { find_async(first, last, output_begin, stream); stream.wait(); @@ -309,7 +309,7 @@ template template void static_set::find_async( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, stream_ref stream) const { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -330,7 +330,7 @@ template template OutputIt static_set::retrieve_all( - OutputIt output_begin, cuda_stream_ref stream) const + OutputIt output_begin, stream_ref stream) const { auto const begin = thrust::make_transform_iterator(thrust::counting_iterator{0}, @@ -349,7 +349,7 @@ template void static_set::rehash( - cuda_stream_ref stream) + stream_ref stream) { auto const is_filled = static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); @@ -364,7 +364,7 @@ template void static_set::rehash( - size_type capacity, cuda_stream_ref stream) + size_type capacity, stream_ref stream) { auto const is_filled = static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); @@ -380,7 +380,7 @@ template void static_set::rehash_async( - cuda_stream_ref stream) + stream_ref stream) { auto const is_filled = static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); @@ -395,7 +395,7 @@ template void static_set::rehash_async( - size_type capacity, cuda_stream_ref stream) + size_type capacity, stream_ref stream) { auto const is_filled = static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); @@ -412,7 +412,7 @@ template static_set::size_type static_set::size( - cuda_stream_ref stream) const noexcept + stream_ref stream) const noexcept { auto const is_filled = static_set_ns::detail::slot_is_filled(this->empty_key_sentinel(), this->erased_key_sentinel()); diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl index c73f4d95c..1fcf4c164 100644 --- a/include/cuco/detail/storage/aow_storage.inl +++ b/include/cuco/detail/storage/aow_storage.inl @@ -16,11 +16,11 @@ #pragma once -#include #include #include #include #include +#include #include @@ -65,15 +65,15 @@ aow_storage::ref() const noexcept template void aow_storage::initialize(value_type key, - cuda_stream_ref stream) noexcept + stream_ref stream) noexcept { this->initialize_async(key, stream); stream.wait(); } template -void aow_storage::initialize_async( - value_type key, cuda_stream_ref stream) noexcept +void aow_storage::initialize_async(value_type key, + stream_ref stream) noexcept { auto constexpr cg_size = 1; auto constexpr stride = 4; diff --git a/include/cuco/detail/storage/counter_storage.cuh b/include/cuco/detail/storage/counter_storage.cuh index 3e71994bb..a71e60f8d 100644 --- a/include/cuco/detail/storage/counter_storage.cuh +++ b/include/cuco/detail/storage/counter_storage.cuh @@ -16,10 +16,10 @@ #pragma once -#include #include #include #include +#include #include @@ -66,7 +66,7 @@ class counter_storage : public storage_basedata(), 0, sizeof(value_type), stream.get())); @@ -94,7 +94,7 @@ class counter_storage : public storage_base +#include #include #include @@ -144,7 +144,7 @@ class dynamic_bitset { constexpr void test(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream = {}) noexcept; + stream_ref stream = {}) noexcept; /** * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total @@ -164,7 +164,7 @@ class dynamic_bitset { constexpr void rank(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream = {}) noexcept; + stream_ref stream = {}) noexcept; /** * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the @@ -184,7 +184,7 @@ class dynamic_bitset { constexpr void select(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream = {}) noexcept; + stream_ref stream = {}) noexcept; using rank_type = cuco::experimental::detail::rank; ///< Rank type @@ -351,7 +351,7 @@ class dynamic_bitset { * * @param stream Stream to execute kernels */ - constexpr void build(cuda_stream_ref stream = {}) noexcept; + constexpr void build(stream_ref stream = {}) noexcept; /** * @brief Populates rank and select indexes for true or false bits @@ -365,7 +365,7 @@ class dynamic_bitset { thrust::device_vector& ranks, thrust::device_vector& selects, bool flip_bits, - cuda_stream_ref stream = {}); + stream_ref stream = {}); }; } // namespace detail diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index 3bbea22ef..41a635920 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -78,7 +78,7 @@ template constexpr void dynamic_bitset::test(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream) noexcept + stream_ref stream) noexcept { build(); @@ -96,7 +96,7 @@ template constexpr void dynamic_bitset::rank(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream) noexcept + stream_ref stream) noexcept { build(); auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -113,7 +113,7 @@ template constexpr void dynamic_bitset::select(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream) noexcept + stream_ref stream) noexcept { build(); @@ -131,7 +131,7 @@ constexpr void dynamic_bitset::build_ranks_and_selects( thrust::device_vector& ranks, thrust::device_vector& selects, bool flip_bits, - cuda_stream_ref stream) + stream_ref stream) { if (n_bits_ == 0) { return; } @@ -246,7 +246,7 @@ constexpr void dynamic_bitset::build_ranks_and_selects( } template -constexpr void dynamic_bitset::build(cuda_stream_ref stream) noexcept +constexpr void dynamic_bitset::build(stream_ref stream) noexcept { if (not is_built_) { build_ranks_and_selects(ranks_true_, selects_true_, false, stream); // 1 bits diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index ea791dce3..925fe3474 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include @@ -24,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -185,7 +185,7 @@ class static_map { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + stream_ref stream = {}); /** * @brief Constructs a statically-sized map with the number of elements to insert `n`, the desired @@ -229,7 +229,7 @@ class static_map { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + stream_ref stream = {}); /** * @brief Constructs a statically-sized map with the specified initial capacity, sentinel values @@ -265,7 +265,7 @@ class static_map { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + stream_ref stream = {}); /** * @brief Erases all elements from the container. After this call, `size()` returns zero. @@ -273,7 +273,7 @@ class static_map { * * @param stream CUDA stream this operation is executed in */ - void clear(cuda_stream_ref stream = {}) noexcept; + void clear(stream_ref stream = {}) noexcept; /** * @brief Asynchronously erases all elements from the container. After this call, `size()` returns @@ -281,7 +281,7 @@ class static_map { * * @param stream CUDA stream this operation is executed in */ - void clear_async(cuda_stream_ref stream = {}) noexcept; + void clear_async(stream_ref stream = {}) noexcept; /** * @brief Inserts all keys in the range `[first, last)` and returns the number of successful @@ -301,7 +301,7 @@ class static_map { * @return Number of successful insertions */ template - size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); + size_type insert(InputIt first, InputIt last, stream_ref stream = {}); /** * @brief Asynchronously inserts all keys in the range `[first, last)`. @@ -315,7 +315,7 @@ class static_map { * @param stream CUDA stream used for insert */ template - void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + void insert_async(InputIt first, InputIt last, stream_ref stream = {}) noexcept; /** * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns @@ -343,7 +343,7 @@ class static_map { */ template size_type insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); + InputIt first, InputIt last, StencilIt stencil, Predicate pred, stream_ref stream = {}); /** * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding @@ -370,7 +370,7 @@ class static_map { InputIt last, StencilIt stencil, Predicate pred, - cuda_stream_ref stream = {}) noexcept; + stream_ref stream = {}) noexcept; /** * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` @@ -391,7 +391,7 @@ class static_map { * @param stream CUDA stream used for insert */ template - void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + void insert_or_assign(InputIt first, InputIt last, stream_ref stream = {}) noexcept; /** * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` @@ -410,7 +410,7 @@ class static_map { * @param stream CUDA stream used for insert */ template - void insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + void insert_or_assign_async(InputIt first, InputIt last, stream_ref stream = {}) noexcept; /** * @brief Erases keys in the range `[first, last)`. @@ -437,7 +437,7 @@ class static_map { * provided at construction */ template - void erase(InputIt first, InputIt last, cuda_stream_ref stream = {}); + void erase(InputIt first, InputIt last, stream_ref stream = {}); /** * @brief Asynchronously erases keys in the range `[first, last)`. @@ -462,7 +462,7 @@ class static_map { * provided at construction */ template - void erase_async(InputIt first, InputIt last, cuda_stream_ref stream = {}); + void erase_async(InputIt first, InputIt last, stream_ref stream = {}); /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. @@ -479,10 +479,7 @@ class static_map { * @param stream Stream used for executing the kernels */ template - void contains(InputIt first, - InputIt last, - OutputIt output_begin, - cuda_stream_ref stream = {}) const; + void contains(InputIt first, InputIt last, OutputIt output_begin, stream_ref stream = {}) const; /** * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in @@ -500,7 +497,7 @@ class static_map { void contains_async(InputIt first, InputIt last, OutputIt output_begin, - cuda_stream_ref stream = {}) const noexcept; + stream_ref stream = {}) const noexcept; /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map if @@ -533,7 +530,7 @@ class static_map { StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream = {}) const; + stream_ref stream = {}) const; /** * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in @@ -564,7 +561,7 @@ class static_map { StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream = {}) const noexcept; + stream_ref stream = {}) const noexcept; /** * @brief For all keys in the range `[first, last)`, finds a payload with its key equivalent to @@ -584,7 +581,7 @@ class static_map { * @param stream Stream used for executing the kernels */ template - void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const; + void find(InputIt first, InputIt last, OutputIt output_begin, stream_ref stream = {}) const; /** * @brief For all keys in the range `[first, last)`, asynchronously finds a payload with its key @@ -603,10 +600,7 @@ class static_map { * @param stream Stream used for executing the kernels */ template - void find_async(InputIt first, - InputIt last, - OutputIt output_begin, - cuda_stream_ref stream = {}) const; + void find_async(InputIt first, InputIt last, OutputIt output_begin, stream_ref stream = {}) const; /** * @brief Retrieves all of the keys and their associated values. @@ -631,7 +625,7 @@ class static_map { template std::pair retrieve_all(KeyOut keys_out, ValueOut values_out, - cuda_stream_ref stream = {}) const; + stream_ref stream = {}) const; /** * @brief Regenerates the container. @@ -641,7 +635,7 @@ class static_map { * * @param stream CUDA stream used for this operation */ - void rehash(cuda_stream_ref stream = {}); + void rehash(stream_ref stream = {}); /** * @brief Reserves at least the specified number of slots and regenerates the container @@ -661,14 +655,14 @@ class static_map { * @param capacity New capacity of the container * @param stream CUDA stream used for this operation */ - void rehash(size_type capacity, cuda_stream_ref stream = {}); + void rehash(size_type capacity, stream_ref stream = {}); /** * @brief Asynchronously regenerates the container. * * @param stream CUDA stream used for this operation */ - void rehash_async(cuda_stream_ref stream = {}); + void rehash_async(stream_ref stream = {}); /** * @brief Asynchronously reserves at least the specified number of slots and regenerates the @@ -686,7 +680,7 @@ class static_map { * @param capacity New capacity of the container * @param stream CUDA stream used for this operation */ - void rehash_async(size_type capacity, cuda_stream_ref stream = {}); + void rehash_async(size_type capacity, stream_ref stream = {}); /** * @brief Gets the number of elements in the container. @@ -696,7 +690,7 @@ class static_map { * @param stream CUDA stream used to get the number of inserted elements * @return The number of elements in the container */ - [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; + [[nodiscard]] size_type size(stream_ref stream = {}) const noexcept; /** * @brief Gets the maximum number of elements the hash map can hold. diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index 2ce0a724e..7b1e02167 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include @@ -24,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -158,7 +158,7 @@ class static_set { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + stream_ref stream = {}); /** * @brief Constructs a statically-sized map with the number of elements to insert `n`, the desired @@ -200,7 +200,7 @@ class static_set { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + stream_ref stream = {}); /** * @brief Constructs a statically-sized set with the specified initial capacity, sentinel values @@ -234,7 +234,7 @@ class static_set { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + stream_ref stream = {}); /** * @brief Erases all elements from the container. After this call, `size()` returns zero. @@ -242,7 +242,7 @@ class static_set { * * @param stream CUDA stream this operation is executed in */ - void clear(cuda_stream_ref stream = {}) noexcept; + void clear(stream_ref stream = {}) noexcept; /** * @brief Asynchronously erases all elements from the container. After this call, `size()` returns @@ -250,7 +250,7 @@ class static_set { * * @param stream CUDA stream this operation is executed in */ - void clear_async(cuda_stream_ref stream = {}) noexcept; + void clear_async(stream_ref stream = {}) noexcept; /** * @brief Inserts all keys in the range `[first, last)` and returns the number of successful @@ -270,7 +270,7 @@ class static_set { * @return Number of successfully inserted keys */ template - size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); + size_type insert(InputIt first, InputIt last, stream_ref stream = {}); /** * @brief Asynchronously inserts all keys in the range `[first, last)`. @@ -284,7 +284,7 @@ class static_set { * @param stream CUDA stream used for insert */ template - void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + void insert_async(InputIt first, InputIt last, stream_ref stream = {}) noexcept; /** * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns @@ -312,7 +312,7 @@ class static_set { */ template size_type insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); + InputIt first, InputIt last, StencilIt stencil, Predicate pred, stream_ref stream = {}); /** * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding @@ -339,7 +339,7 @@ class static_set { InputIt last, StencilIt stencil, Predicate pred, - cuda_stream_ref stream = {}) noexcept; + stream_ref stream = {}) noexcept; /** * @brief Erases keys in the range `[first, last)`. @@ -366,7 +366,7 @@ class static_set { * provided at construction */ template - void erase(InputIt first, InputIt last, cuda_stream_ref stream = {}); + void erase(InputIt first, InputIt last, stream_ref stream = {}); /** * @brief Asynchronously erases keys in the range `[first, last)`. @@ -391,7 +391,7 @@ class static_set { * provided at construction */ template - void erase_async(InputIt first, InputIt last, cuda_stream_ref stream = {}); + void erase_async(InputIt first, InputIt last, stream_ref stream = {}); /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the set. @@ -408,10 +408,7 @@ class static_set { * @param stream Stream used for executing the kernels */ template - void contains(InputIt first, - InputIt last, - OutputIt output_begin, - cuda_stream_ref stream = {}) const; + void contains(InputIt first, InputIt last, OutputIt output_begin, stream_ref stream = {}) const; /** * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in @@ -429,7 +426,7 @@ class static_set { void contains_async(InputIt first, InputIt last, OutputIt output_begin, - cuda_stream_ref stream = {}) const noexcept; + stream_ref stream = {}) const noexcept; /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the set if @@ -462,7 +459,7 @@ class static_set { StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream = {}) const; + stream_ref stream = {}) const; /** * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in @@ -493,7 +490,7 @@ class static_set { StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream = {}) const noexcept; + stream_ref stream = {}) const noexcept; /** * @brief For all keys in the range `[first, last)`, finds an element with key equivalent to the @@ -512,7 +509,7 @@ class static_set { * @param stream Stream used for executing the kernels */ template - void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const; + void find(InputIt first, InputIt last, OutputIt output_begin, stream_ref stream = {}) const; /** * @brief For all keys in the range `[first, last)`, asynchronously finds an element with key @@ -530,10 +527,7 @@ class static_set { * @param stream Stream used for executing the kernels */ template - void find_async(InputIt first, - InputIt last, - OutputIt output_begin, - cuda_stream_ref stream = {}) const; + void find_async(InputIt first, InputIt last, OutputIt output_begin, stream_ref stream = {}) const; /** * @brief Retrieves all keys contained in the set. @@ -553,7 +547,7 @@ class static_set { * @return Iterator indicating the end of the output */ template - OutputIt retrieve_all(OutputIt output_begin, cuda_stream_ref stream = {}) const; + OutputIt retrieve_all(OutputIt output_begin, stream_ref stream = {}) const; /** * @brief Regenerates the container. @@ -563,7 +557,7 @@ class static_set { * * @param stream CUDA stream used for this operation */ - void rehash(cuda_stream_ref stream = {}); + void rehash(stream_ref stream = {}); /** * @brief Reserves at least the specified number of slots and regenerates the container @@ -583,14 +577,14 @@ class static_set { * @param capacity New capacity of the container * @param stream CUDA stream used for this operation */ - void rehash(size_type capacity, cuda_stream_ref stream = {}); + void rehash(size_type capacity, stream_ref stream = {}); /** * @brief Asynchronously regenerates the container. * * @param stream CUDA stream used for this operation */ - void rehash_async(cuda_stream_ref stream = {}); + void rehash_async(stream_ref stream = {}); /** * @brief Asynchronously reserves at least the specified number of slots and regenerates the @@ -608,7 +602,7 @@ class static_set { * @param capacity New capacity of the container * @param stream CUDA stream used for this operation */ - void rehash_async(size_type capacity, cuda_stream_ref stream = {}); + void rehash_async(size_type capacity, stream_ref stream = {}); /** * @brief Gets the number of elements in the container. @@ -618,7 +612,7 @@ class static_set { * @param stream CUDA stream used to get the number of inserted elements * @return The number of elements in the container */ - [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; + [[nodiscard]] size_type size(stream_ref stream = {}) const noexcept; /** * @brief Gets the maximum number of elements the hash map can hold. diff --git a/include/cuco/cuda_stream_ref.hpp b/include/cuco/stream_ref.hpp similarity index 95% rename from include/cuco/cuda_stream_ref.hpp rename to include/cuco/stream_ref.hpp index ef41facf7..997c87b3f 100644 --- a/include/cuco/cuda_stream_ref.hpp +++ b/include/cuco/stream_ref.hpp @@ -22,7 +22,7 @@ namespace cuco { namespace experimental { -using cuda_stream_ref = cuda::stream_ref; +using stream_ref = cuda::stream_ref; } // namespace experimental } // namespace cuco