diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh index 522819311..abdbaaff7 100644 --- a/include/cuco/aow_storage.cuh +++ b/include/cuco/aow_storage.cuh @@ -16,12 +16,12 @@ #pragma once -#include #include #include #include #include +#include #include #include @@ -122,7 +122,7 @@ class aow_storage : public detail::aow_storage_base { * @param key Key to which all keys in `slots` are initialized * @param stream Stream used for executing the kernel */ - void initialize(value_type key, cuda_stream_ref stream = {}); + void initialize(value_type key, cuda::stream_ref stream = {}); /** * @brief Asynchronously initializes each slot in the AoW storage to contain `key`. @@ -130,7 +130,7 @@ class aow_storage : public detail::aow_storage_base { * @param key Key to which all keys in `slots` are initialized * @param stream Stream used for executing the kernel */ - void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept; + void initialize_async(value_type key, cuda::stream_ref stream = {}) noexcept; private: allocator_type allocator_; ///< Allocator used to (de)allocate windows diff --git a/include/cuco/cuda_stream_ref.hpp b/include/cuco/cuda_stream_ref.hpp deleted file mode 100644 index 19cd3093e..000000000 --- a/include/cuco/cuda_stream_ref.hpp +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -namespace cuco { - -/** - * @brief Strongly-typed non-owning wrapper for CUDA streams with default constructor. - * - * This wrapper is simply a "view": it does not own the lifetime of the stream it wraps. - */ -class cuda_stream_ref { - public: - constexpr cuda_stream_ref() = default; ///< Default constructor - constexpr cuda_stream_ref(cuda_stream_ref const&) = default; ///< Copy constructor - constexpr cuda_stream_ref(cuda_stream_ref&&) = default; ///< Move constructor - - /** - * @brief Copy-assignment operator. - * - * @return Copy of this stream reference. - */ - constexpr cuda_stream_ref& operator=(cuda_stream_ref const&) = default; - - /** - * @brief Move-assignment operator. - * - * @return New location of this stream reference. - */ - constexpr cuda_stream_ref& operator=(cuda_stream_ref&&) = default; ///< Move-assignment operator - - ~cuda_stream_ref() = default; - - constexpr cuda_stream_ref(int) = delete; //< Prevent cast from literal 0 - constexpr cuda_stream_ref(std::nullptr_t) = delete; //< Prevent cast from nullptr - - /** - * @brief Implicit conversion from `cudaStream_t`. - * - * @param stream The CUDA stream to reference. - */ - constexpr cuda_stream_ref(cudaStream_t stream) noexcept : stream_{stream} {} - - /** - * @brief Get the wrapped stream. - * - * @return The wrapped stream. - */ - [[nodiscard]] constexpr cudaStream_t value() const noexcept { return stream_; } - - /** - * @brief Implicit conversion to `cudaStream_t`. - * - * @return The underlying `cudaStream_t`. - */ - constexpr operator cudaStream_t() const noexcept { return value(); } - - /** - * @brief Return true if the wrapped stream is the CUDA per-thread default stream. - * - * @return True if the wrapped stream is the per-thread default stream; else false. - */ - [[nodiscard]] inline bool is_per_thread_default() const noexcept; - - /** - * @brief Return true if the wrapped stream is explicitly the CUDA legacy default stream. - * - * @return True if the wrapped stream is the default stream; else false. - */ - [[nodiscard]] inline bool is_default() const noexcept; - - /** - * @brief Synchronize the viewed CUDA stream. - * - * Calls `cudaStreamSynchronize()`. - * - * @throw cuco::cuda_error if stream synchronization fails - */ - void synchronize() const; - - private: - cudaStream_t stream_{}; -}; - -/** - * @brief Static `cuda_stream_ref` of the default stream (stream 0), for convenience - */ -static constexpr cuda_stream_ref cuda_stream_default{}; - -/** - * @brief Static `cuda_stream_ref` of cudaStreamLegacy, for convenience - */ -static const cuda_stream_ref cuda_stream_legacy{cudaStreamLegacy}; - -/** - * @brief Static `cuda_stream_ref` of cudaStreamPerThread, for convenience - */ -static const cuda_stream_ref cuda_stream_per_thread{cudaStreamPerThread}; - -// /** -// * @brief Equality comparison operator for streams -// * -// * @param lhs The first stream view to compare -// * @param rhs The second stream view to compare -// * @return true if equal, false if unequal -// */ -// inline bool operator==(cuda_stream_ref lhs, cuda_stream_ref rhs) -// { -// return lhs.value() == rhs.value(); -// } - -// /** -// * @brief Inequality comparison operator for streams -// * -// * @param lhs The first stream view to compare -// * @param rhs The second stream view to compare -// * @return true if unequal, false if equal -// */ -// inline bool operator!=(cuda_stream_ref lhs, cuda_stream_ref rhs) { return not(lhs == rhs); } - -} // namespace cuco - -#include diff --git a/include/cuco/detail/cuda_stream_ref.inl b/include/cuco/detail/cuda_stream_ref.inl deleted file mode 100644 index 16f11f04d..000000000 --- a/include/cuco/detail/cuda_stream_ref.inl +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include - -namespace cuco { - -[[nodiscard]] inline bool cuda_stream_ref::is_per_thread_default() const noexcept -{ -#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM - return value() == cuda_stream_per_thread || value() == nullptr; -#else - return value() == cuda_stream_per_thread; -#endif -} - -[[nodiscard]] inline bool cuda_stream_ref::is_default() const noexcept -{ -#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM - return value() == cuda_stream_legacy; -#else - return value() == cuda_stream_legacy || value() == nullptr; -#endif -} - -inline void cuda_stream_ref::synchronize() const -{ - CUCO_CUDA_TRY(cudaStreamSynchronize(this->stream_)); -} - -} // namespace cuco diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index 2caa806ea..be5c3764e 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -21,7 +21,7 @@ constexpr distinct_count_estimator::distinct_count_es cuco::sketch_size_kb sketch_size_kb, Hash const& hash, Allocator const& alloc, - cuco::cuda_stream_ref stream) + cuda::stream_ref stream) : impl_{std::make_unique(sketch_size_kb, hash, alloc, stream)} { } @@ -31,21 +31,20 @@ constexpr distinct_count_estimator::distinct_count_es cuco::standard_deviation standard_deviation, Hash const& hash, Allocator const& alloc, - cuco::cuda_stream_ref stream) + cuda::stream_ref stream) : impl_{std::make_unique(standard_deviation, hash, alloc, stream)} { } template constexpr void distinct_count_estimator::clear_async( - cuco::cuda_stream_ref stream) noexcept + cuda::stream_ref stream) noexcept { this->impl_->clear_async(stream); } template -constexpr void distinct_count_estimator::clear( - cuco::cuda_stream_ref stream) +constexpr void distinct_count_estimator::clear(cuda::stream_ref stream) { this->impl_->clear(stream); } @@ -53,15 +52,16 @@ constexpr void distinct_count_estimator::clear( template template constexpr void distinct_count_estimator::add_async( - InputIt first, InputIt last, cuco::cuda_stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) { this->impl_->add_async(first, last, stream); } template template -constexpr void distinct_count_estimator::add( - InputIt first, InputIt last, cuco::cuda_stream_ref stream) +constexpr void distinct_count_estimator::add(InputIt first, + InputIt last, + cuda::stream_ref stream) { this->impl_->add(first, last, stream); } @@ -70,7 +70,7 @@ template template constexpr void distinct_count_estimator::merge_async( distinct_count_estimator const& other, - cuco::cuda_stream_ref stream) + cuda::stream_ref stream) { this->impl_->merge_async(*(other.impl_), stream); } @@ -79,7 +79,7 @@ template template constexpr void distinct_count_estimator::merge( distinct_count_estimator const& other, - cuco::cuda_stream_ref stream) + cuda::stream_ref stream) { this->impl_->merge(*(other.impl_), stream); } @@ -87,7 +87,7 @@ constexpr void distinct_count_estimator::merge( template template constexpr void distinct_count_estimator::merge_async( - ref_type const& other_ref, cuco::cuda_stream_ref stream) + ref_type const& other_ref, cuda::stream_ref stream) { this->impl_->merge_async(other_ref.impl_, stream); } @@ -95,14 +95,14 @@ constexpr void distinct_count_estimator::merge_async( template template constexpr void distinct_count_estimator::merge( - ref_type const& other_ref, cuco::cuda_stream_ref stream) + ref_type const& other_ref, cuda::stream_ref stream) { this->impl_->merge(other_ref.impl_, stream); } template constexpr std::size_t distinct_count_estimator::estimate( - cuco::cuda_stream_ref stream) const + cuda::stream_ref stream) const { return this->impl_->estimate(stream); } diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl index 97649d5bc..6ded30148 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl @@ -34,14 +34,13 @@ __device__ constexpr void distinct_count_estimator_ref::clear( template __host__ constexpr void distinct_count_estimator_ref::clear_async( - cuco::cuda_stream_ref stream) noexcept + cuda::stream_ref stream) noexcept { this->impl_.clear_async(stream); } template -__host__ constexpr void distinct_count_estimator_ref::clear( - cuco::cuda_stream_ref stream) +__host__ constexpr void distinct_count_estimator_ref::clear(cuda::stream_ref stream) { this->impl_.clear(stream); } @@ -55,15 +54,16 @@ __device__ constexpr void distinct_count_estimator_ref::add(T co template template __host__ constexpr void distinct_count_estimator_ref::add_async( - InputIt first, InputIt last, cuco::cuda_stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) { this->impl_.add_async(first, last, stream); } template template -__host__ constexpr void distinct_count_estimator_ref::add( - InputIt first, InputIt last, cuco::cuda_stream_ref stream) +__host__ constexpr void distinct_count_estimator_ref::add(InputIt first, + InputIt last, + cuda::stream_ref stream) { this->impl_.add(first, last, stream); } @@ -79,7 +79,7 @@ __device__ constexpr void distinct_count_estimator_ref::merge( template template __host__ constexpr void distinct_count_estimator_ref::merge_async( - distinct_count_estimator_ref const& other, cuco::cuda_stream_ref stream) + distinct_count_estimator_ref const& other, cuda::stream_ref stream) { this->impl_.merge_async(other, stream); } @@ -87,7 +87,7 @@ __host__ constexpr void distinct_count_estimator_ref::merge_asyn template template __host__ constexpr void distinct_count_estimator_ref::merge( - distinct_count_estimator_ref const& other, cuco::cuda_stream_ref stream) + distinct_count_estimator_ref const& other, cuda::stream_ref stream) { this->impl_.merge(other, stream); } @@ -101,7 +101,7 @@ __device__ std::size_t distinct_count_estimator_ref::estimate( template __host__ constexpr std::size_t distinct_count_estimator_ref::estimate( - cuco::cuda_stream_ref stream) const + cuda::stream_ref stream) const { return this->impl_.estimate(stream); } diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 011d2bee7..98a5e4857 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include #include @@ -23,6 +22,8 @@ #include #include +#include + #include #include #include @@ -69,7 +70,7 @@ class hyperloglog { constexpr hyperloglog(std::size_t sketch_size_b, Hash const& hash, Allocator const& alloc, - cuco::cuda_stream_ref stream) + cuda::stream_ref stream) : allocator_{alloc}, sketch_{this->allocator_.allocate(sketch_size_b / sizeof(register_type)), custom_deleter{sketch_size_b / sizeof(register_type), this->allocator_}}, @@ -92,7 +93,7 @@ class hyperloglog { constexpr hyperloglog(cuco::sketch_size_kb sketch_size_kb, Hash const& hash, Allocator const& alloc, - cuco::cuda_stream_ref stream) + cuda::stream_ref stream) : hyperloglog{sketch_bytes(sketch_size_kb), hash, alloc, stream} { } @@ -110,7 +111,7 @@ class hyperloglog { constexpr hyperloglog(cuco::standard_deviation standard_deviation, Hash const& hash, Allocator const& alloc, - cuco::cuda_stream_ref stream) + cuda::stream_ref stream) : hyperloglog{sketch_bytes(standard_deviation), hash, alloc, stream} { } @@ -133,10 +134,7 @@ class hyperloglog { * * @param stream CUDA stream this operation is executed in */ - constexpr void clear_async(cuco::cuda_stream_ref stream) noexcept - { - this->ref_.clear_async(stream); - } + constexpr void clear_async(cuda::stream_ref stream) noexcept { this->ref_.clear_async(stream); } /** * @brief Resets the estimator, i.e., clears the current count estimate. @@ -146,7 +144,7 @@ class hyperloglog { * * @param stream CUDA stream this operation is executed in */ - constexpr void clear(cuco::cuda_stream_ref stream) { this->ref_.clear(stream); } + constexpr void clear(cuda::stream_ref stream) { this->ref_.clear(stream); } /** * @brief Asynchronously adds to be counted items to the estimator. @@ -160,7 +158,7 @@ class hyperloglog { * @param stream CUDA stream this operation is executed in */ template - constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) + constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream) { this->ref_.add_async(first, last, stream); } @@ -180,7 +178,7 @@ class hyperloglog { * @param stream CUDA stream this operation is executed in */ template - constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream) + constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream) { this->ref_.add(first, last, stream); } @@ -198,7 +196,7 @@ class hyperloglog { */ template constexpr void merge_async(hyperloglog const& other, - cuco::cuda_stream_ref stream) + cuda::stream_ref stream) { this->ref_.merge_async(other.ref(), stream); } @@ -219,7 +217,7 @@ class hyperloglog { */ template constexpr void merge(hyperloglog const& other, - cuco::cuda_stream_ref stream) + cuda::stream_ref stream) { this->ref_.merge(other.ref(), stream); } @@ -235,7 +233,7 @@ class hyperloglog { * @param stream CUDA stream this operation is executed in */ template - constexpr void merge_async(ref_type const& other_ref, cuco::cuda_stream_ref stream) + constexpr void merge_async(ref_type const& other_ref, cuda::stream_ref stream) { this->ref_.merge_async(other_ref, stream); } @@ -254,7 +252,7 @@ class hyperloglog { * @param stream CUDA stream this operation is executed in */ template - constexpr void merge(ref_type const& other_ref, cuco::cuda_stream_ref stream) + constexpr void merge(ref_type const& other_ref, cuda::stream_ref stream) { this->ref_.merge(other_ref, stream); } @@ -268,7 +266,7 @@ class hyperloglog { * * @return Approximate distinct items count */ - [[nodiscard]] constexpr std::size_t estimate(cuco::cuda_stream_ref stream) const + [[nodiscard]] constexpr std::size_t estimate(cuda::stream_ref stream) const { return this->ref_.estimate(stream); } diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index bd7cd95ad..f68f54594 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include #include @@ -29,6 +28,7 @@ #include #include #include +#include #include #include @@ -122,10 +122,10 @@ class hyperloglog_ref { * * @param stream CUDA stream this operation is executed in */ - __host__ constexpr void clear(cuco::cuda_stream_ref stream) + __host__ constexpr void clear(cuda::stream_ref stream) { this->clear_async(stream); - stream.synchronize(); + stream.wait(); } /** @@ -133,10 +133,10 @@ class hyperloglog_ref { * * @param stream CUDA stream this operation is executed in */ - __host__ constexpr void clear_async(cuco::cuda_stream_ref stream) noexcept + __host__ constexpr void clear_async(cuda::stream_ref stream) noexcept { auto constexpr block_size = 1024; - cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream>>>(*this); + cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream.get()>>>(*this); } /** @@ -169,7 +169,7 @@ class hyperloglog_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream) + __host__ constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream) { auto const num_items = cuco::detail::distance(first, last); if (num_items == 0) { return; } @@ -222,7 +222,7 @@ class hyperloglog_ref { (void*)(&num_items), reinterpret_cast(this)}; CUCO_CUDA_TRY( - cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream)); + cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream.get())); } } else { kernel = reinterpret_cast( @@ -233,7 +233,7 @@ class hyperloglog_ref { cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, shmem_bytes)); CUCO_CUDA_TRY( - cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream)); + cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream.get())); } else { // Computes sketch directly in global memory. (Fallback path in case there is not enough // shared memory avalable) @@ -242,7 +242,8 @@ class hyperloglog_ref { CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, 0)); - CUCO_CUDA_TRY(cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, 0, stream)); + CUCO_CUDA_TRY( + cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, 0, stream.get())); } } } @@ -262,10 +263,10 @@ class hyperloglog_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream) + __host__ constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream) { this->add_async(first, last, stream); - stream.synchronize(); + stream.wait(); } /** @@ -304,13 +305,13 @@ class hyperloglog_ref { */ template __host__ constexpr void merge_async(hyperloglog_ref const& other, - cuco::cuda_stream_ref stream) + cuda::stream_ref stream) { CUCO_EXPECTS(other.precision_ == this->precision_, "Cannot merge estimators with different sketch sizes", std::runtime_error); auto constexpr block_size = 1024; - cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream>>>(other, *this); + cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream.get()>>>(other, *this); } /** @@ -328,10 +329,10 @@ class hyperloglog_ref { */ template __host__ constexpr void merge(hyperloglog_ref const& other, - cuco::cuda_stream_ref stream) + cuda::stream_ref stream) { this->merge_async(other, stream); - stream.synchronize(); + stream.wait(); } /** @@ -403,7 +404,7 @@ class hyperloglog_ref { * * @return Approximate distinct items count */ - [[nodiscard]] __host__ constexpr std::size_t estimate(cuco::cuda_stream_ref stream) const + [[nodiscard]] __host__ constexpr std::size_t estimate(cuda::stream_ref stream) const { auto const num_regs = 1ull << this->precision_; std::vector host_sketch(num_regs); @@ -413,8 +414,8 @@ class hyperloglog_ref { this->sketch_.data(), sizeof(register_type) * num_regs, cudaMemcpyDefault, - stream)); - stream.synchronize(); + stream.get())); + stream.wait(); fp_type sum = 0; int zeroes = 0; @@ -566,4 +567,4 @@ class hyperloglog_ref { template friend class hyperloglog_ref; }; -} // namespace cuco::detail \ No newline at end of file +} // namespace cuco::detail diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 548ad09a6..befde9685 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -125,7 +125,7 @@ class open_addressing_impl { KeyEqual const& pred, ProbingScheme const& probing_scheme, Allocator const& alloc, - cuda_stream_ref stream) + cuda::stream_ref stream) : empty_slot_sentinel_{empty_slot_sentinel}, erased_key_sentinel_{this->extract_key(empty_slot_sentinel)}, predicate_{pred}, @@ -171,7 +171,7 @@ class open_addressing_impl { KeyEqual const& pred, ProbingScheme const& probing_scheme, Allocator const& alloc, - cuda_stream_ref stream) + cuda::stream_ref stream) : empty_slot_sentinel_{empty_slot_sentinel}, erased_key_sentinel_{this->extract_key(empty_slot_sentinel)}, predicate_{pred}, @@ -213,7 +213,7 @@ class open_addressing_impl { KeyEqual const& pred, ProbingScheme const& probing_scheme, Allocator const& alloc, - cuda_stream_ref stream) + cuda::stream_ref stream) : empty_slot_sentinel_{empty_slot_sentinel}, erased_key_sentinel_{erased_key_sentinel}, predicate_{pred}, @@ -233,7 +233,7 @@ class open_addressing_impl { * * @param stream CUDA stream this operation is executed in */ - void clear(cuda_stream_ref stream) { storage_.initialize(empty_slot_sentinel_, stream); } + void clear(cuda::stream_ref stream) { storage_.initialize(empty_slot_sentinel_, stream); } /** * @brief Asynchronously erases all elements from the container. After this call, `size()` returns @@ -241,7 +241,7 @@ class open_addressing_impl { * * @param stream CUDA stream this operation is executed in */ - void clear_async(cuda_stream_ref stream) noexcept + void clear_async(cuda::stream_ref stream) noexcept { storage_.initialize_async(empty_slot_sentinel_, stream); } @@ -266,7 +266,7 @@ class open_addressing_impl { * @return Number of successfully inserted keys */ template - size_type insert(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream) + size_type insert(InputIt first, InputIt last, Ref container_ref, cuda::stream_ref stream) { auto const always_true = thrust::constant_iterator{true}; return this->insert_if(first, last, always_true, thrust::identity{}, container_ref, stream); @@ -286,7 +286,10 @@ class open_addressing_impl { * @param stream CUDA stream used for insert */ template - void insert_async(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream) noexcept + void insert_async(InputIt first, + InputIt last, + Ref container_ref, + cuda::stream_ref stream) noexcept { auto const always_true = thrust::constant_iterator{true}; this->insert_if_async(first, last, always_true, thrust::identity{}, container_ref, stream); @@ -324,7 +327,7 @@ class open_addressing_impl { StencilIt stencil, Predicate pred, Ref container_ref, - cuda_stream_ref stream) + cuda::stream_ref stream) { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return 0; } @@ -336,7 +339,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); detail::insert_if_n - <<>>( + <<>>( first, num_keys, stencil, pred, counter.data(), container_ref); return counter.load_to_host(stream); @@ -370,7 +373,7 @@ class open_addressing_impl { StencilIt stencil, Predicate pred, Ref container_ref, - cuda_stream_ref stream) noexcept + cuda::stream_ref stream) noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -378,7 +381,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); detail::insert_if_n - <<>>( + <<>>( first, num_keys, stencil, pred, container_ref); } @@ -406,7 +409,7 @@ class open_addressing_impl { * provided at construction */ template - void erase_async(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream = {}) + void erase_async(InputIt first, InputIt last, Ref container_ref, cuda::stream_ref stream = {}) { CUCO_EXPECTS(this->empty_key_sentinel() != this->erased_key_sentinel(), "The empty key sentinel and erased key sentinel cannot be the same value.", @@ -418,7 +421,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); detail::erase - <<>>( + <<>>( first, num_keys, container_ref); } @@ -441,7 +444,7 @@ class open_addressing_impl { InputIt last, OutputIt output_begin, Ref container_ref, - cuda_stream_ref stream) const noexcept + cuda::stream_ref stream) const noexcept { auto const always_true = thrust::constant_iterator{true}; this->contains_if_async( @@ -484,7 +487,7 @@ class open_addressing_impl { Predicate pred, OutputIt output_begin, Ref container_ref, - cuda_stream_ref stream) const noexcept + cuda::stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -492,7 +495,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); detail::contains_if_n - <<>>( + <<>>( first, num_keys, stencil, pred, output_begin, container_ref); } @@ -515,7 +518,7 @@ class open_addressing_impl { InputIt last, OutputIt output_begin, Ref container_ref, - cuda_stream_ref stream) const noexcept + cuda::stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } @@ -523,7 +526,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); detail::find - <<>>( + <<>>( first, num_keys, output_begin, container_ref); } @@ -543,7 +546,7 @@ class open_addressing_impl { [[nodiscard]] size_type count(InputIt first, InputIt last, Ref container_ref, - cuda_stream_ref stream) const noexcept + cuda::stream_ref stream) const noexcept { auto constexpr is_outer = false; return this->count(first, last, container_ref, stream); @@ -567,7 +570,7 @@ class open_addressing_impl { [[nodiscard]] size_type count_outer(InputIt first, InputIt last, Ref container_ref, - cuda_stream_ref stream) const noexcept + cuda::stream_ref stream) const noexcept { auto constexpr is_outer = true; return this->count(first, last, container_ref, stream); @@ -591,7 +594,7 @@ class open_addressing_impl { * @return Iterator indicating the end of the output */ template - [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cuda_stream_ref stream) const + [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cuda::stream_ref stream) const { std::size_t temp_storage_bytes = 0; using temp_allocator_type = @@ -611,7 +614,7 @@ class open_addressing_impl { d_num_out, this->capacity(), is_filled, - stream)); + stream.get())); // Allocate temporary storage auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); @@ -623,12 +626,12 @@ class open_addressing_impl { d_num_out, this->capacity(), is_filled, - stream)); + stream.get())); size_type h_num_out; - CUCO_CUDA_TRY( - cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); - stream.synchronize(); + CUCO_CUDA_TRY(cudaMemcpyAsync( + &h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get())); + stream.wait(); std::allocator_traits::deallocate( temp_allocator, reinterpret_cast(d_num_out), sizeof(size_type)); temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); @@ -645,7 +648,7 @@ class open_addressing_impl { * * @return The number of elements in the container */ - [[nodiscard]] size_type size(cuda_stream_ref stream) const + [[nodiscard]] size_type size(cuda::stream_ref stream) const { auto counter = detail::counter_storage{this->allocator()}; @@ -658,7 +661,7 @@ class open_addressing_impl { // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to // v2.1.0 detail::size - <<>>( + <<>>( storage_.ref(), is_filled, counter.data()); return counter.load_to_host(stream); @@ -677,10 +680,10 @@ class open_addressing_impl { * @param stream CUDA stream used for this operation */ template - void rehash(Container const& container, cuda_stream_ref stream) + void rehash(Container const& container, cuda::stream_ref stream) { this->rehash_async(container, stream); - stream.synchronize(); + stream.wait(); } /** @@ -706,10 +709,10 @@ class open_addressing_impl { * @param stream CUDA stream used for this operation */ template - void rehash(extent_type extent, Container const& container, cuda_stream_ref stream) + void rehash(extent_type extent, Container const& container, cuda::stream_ref stream) { this->rehash_async(extent, container, stream); - stream.synchronize(); + stream.wait(); } /** @@ -722,7 +725,7 @@ class open_addressing_impl { * @param stream CUDA stream used for this operation */ template - void rehash_async(Container const& container, cuda_stream_ref stream) + void rehash_async(Container const& container, cuda::stream_ref stream) { this->rehash_async(this->storage_.window_extent(), container, stream); } @@ -747,7 +750,7 @@ class open_addressing_impl { * @param stream CUDA stream used for this operation */ template - void rehash_async(extent_type extent, Container const& container, cuda_stream_ref stream) + void rehash_async(extent_type extent, Container const& container, cuda::stream_ref stream) { auto const old_storage = std::move(this->storage_); new (&storage_) storage_type{extent, this->allocator()}; @@ -762,7 +765,7 @@ class open_addressing_impl { auto const is_filled = open_addressing_ns::detail::slot_is_filled{ this->empty_key_sentinel(), this->erased_key_sentinel()}; - detail::rehash<<>>( + detail::rehash<<>>( old_storage.ref(), container.ref(op::insert), is_filled); } @@ -844,7 +847,7 @@ class open_addressing_impl { [[nodiscard]] size_type count(InputIt first, InputIt last, Ref container_ref, - cuda_stream_ref stream) const noexcept + cuda::stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return 0; } @@ -856,7 +859,7 @@ class open_addressing_impl { auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); detail::count - <<>>( + <<>>( first, num_keys, counter.data(), container_ref); return counter.load_to_host(stream); diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 3fa1d0220..9ca129038 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -22,6 +21,8 @@ #include #include +#include + #include namespace cuco { @@ -43,7 +44,7 @@ constexpr static_map, Storage, Allocator const& alloc, - cuda_stream_ref stream) + cuda::stream_ref stream) : impl_{std::make_unique(capacity, cuco::pair{empty_key_sentinel, empty_value_sentinel}, pred, @@ -72,7 +73,7 @@ constexpr static_map, Storage, Allocator const& alloc, - cuda_stream_ref stream) + cuda::stream_ref stream) : impl_{std::make_unique(n, desired_load_factor, cuco::pair{empty_key_sentinel, empty_value_sentinel}, @@ -102,7 +103,7 @@ constexpr static_map, Storage, Allocator const& alloc, - cuda_stream_ref stream) + cuda::stream_ref stream) : impl_{std::make_unique(capacity, cuco::pair{empty_key_sentinel, empty_value_sentinel}, erased_key_sentinel, @@ -123,7 +124,7 @@ template void static_map::clear( - cuda_stream_ref stream) + cuda::stream_ref stream) { impl_->clear(stream); } @@ -137,7 +138,7 @@ template void static_map::clear_async( - cuda_stream_ref stream) noexcept + cuda::stream_ref stream) noexcept { impl_->clear_async(stream); } @@ -153,7 +154,7 @@ template static_map::size_type static_map::insert( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) { return impl_->insert(first, last, ref(op::insert), stream); } @@ -168,7 +169,7 @@ template template void static_map::insert_async( - InputIt first, InputIt last, cuda_stream_ref stream) noexcept + InputIt first, InputIt last, cuda::stream_ref stream) noexcept { impl_->insert_async(first, last, ref(op::insert), stream); } @@ -184,7 +185,7 @@ template static_map::size_type static_map::insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) { return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); } @@ -199,8 +200,11 @@ template template void static_map:: - insert_if_async( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept + insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream) noexcept { impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); } @@ -215,10 +219,10 @@ template template void static_map:: - insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) + insert_or_assign(InputIt first, InputIt last, cuda::stream_ref stream) { return this->insert_or_assign_async(first, last, stream); - stream.synchronize(); + stream.wait(); } template template void static_map:: - insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream) noexcept + insert_or_assign_async(InputIt first, InputIt last, cuda::stream_ref stream) noexcept { auto const num = cuco::detail::distance(first, last); if (num == 0) { return; } @@ -239,7 +243,7 @@ void static_map - <<>>( + <<>>( first, num, ref(op::insert_or_assign)); } @@ -253,10 +257,10 @@ template template void static_map::erase( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) { erase_async(first, last, stream); - stream.synchronize(); + stream.wait(); } template template void static_map::erase_async( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) { impl_->erase_async(first, last, ref(op::erase), stream); } @@ -284,10 +288,10 @@ template template void static_map::contains( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const { contains_async(first, last, output_begin, stream); - stream.synchronize(); + stream.wait(); } template template void static_map::contains_async( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const noexcept + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const noexcept { impl_->contains_async(first, last, output_begin, ref(op::contains), stream); } @@ -320,10 +324,10 @@ void static_mapcontains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream); } @@ -356,10 +360,10 @@ template template void static_map::find( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const { find_async(first, last, output_begin, stream); - stream.synchronize(); + stream.wait(); } template template void static_map::find_async( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const { impl_->find_async(first, last, output_begin, ref(op::find), stream); } @@ -388,7 +392,7 @@ template std::pair static_map::retrieve_all( - KeyOut keys_out, ValueOut values_out, cuda_stream_ref stream) const + KeyOut keys_out, ValueOut values_out, cuda::stream_ref stream) const { auto const zipped_out_begin = thrust::make_zip_iterator(thrust::make_tuple(keys_out, values_out)); auto const zipped_out_end = impl_->retrieve_all(zipped_out_begin, stream); @@ -406,7 +410,7 @@ template void static_map::rehash( - cuda_stream_ref stream) + cuda::stream_ref stream) { this->impl_->rehash(*this, stream); } @@ -420,7 +424,7 @@ template void static_map::rehash( - size_type capacity, cuda_stream_ref stream) + size_type capacity, cuda::stream_ref stream) { auto const extent = make_window_extent(capacity); this->impl_->rehash(extent, *this, stream); @@ -435,7 +439,7 @@ template void static_map::rehash_async( - cuda_stream_ref stream) + cuda::stream_ref stream) { this->impl_->rehash_async(*this, stream); } @@ -449,7 +453,7 @@ template void static_map::rehash_async( - size_type capacity, cuda_stream_ref stream) + size_type capacity, cuda::stream_ref stream) { auto const extent = make_window_extent(capacity); this->impl_->rehash_async(extent, *this, stream); @@ -465,7 +469,7 @@ template static_map::size_type static_map::size( - cuda_stream_ref stream) const + cuda::stream_ref stream) const { return impl_->size(stream); } diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl index d24a090dd..4b2a5af23 100644 --- a/include/cuco/detail/static_multiset/static_multiset.inl +++ b/include/cuco/detail/static_multiset/static_multiset.inl @@ -39,7 +39,7 @@ constexpr static_multiset, Storage, Allocator const& alloc, - cuda_stream_ref stream) + cuda::stream_ref stream) : impl_{std::make_unique( capacity, empty_key_sentinel, pred, probing_scheme, alloc, stream)} { @@ -61,7 +61,7 @@ constexpr static_multiset, Storage, Allocator const& alloc, - cuda_stream_ref stream) + cuda::stream_ref stream) : impl_{std::make_unique( n, desired_load_factor, empty_key_sentinel, pred, probing_scheme, alloc, stream)} { @@ -83,7 +83,7 @@ constexpr static_multiset, Storage, Allocator const& alloc, - cuda_stream_ref stream) + cuda::stream_ref stream) : impl_{std::make_unique( capacity, empty_key_sentinel, erased_key_sentinel, pred, probing_scheme, alloc, stream)} { @@ -97,7 +97,7 @@ template void static_multiset::clear( - cuda_stream_ref stream) + cuda::stream_ref stream) { impl_->clear(stream); } @@ -110,7 +110,7 @@ template void static_multiset::clear_async( - cuda_stream_ref stream) noexcept + cuda::stream_ref stream) noexcept { impl_->clear_async(stream); } @@ -124,10 +124,10 @@ template template void static_multiset::insert( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) { this->insert_async(first, last, stream); - stream.synchronize(); + stream.wait(); } template template void static_multiset::insert_async( - InputIt first, InputIt last, cuda_stream_ref stream) noexcept + InputIt first, InputIt last, cuda::stream_ref stream) noexcept { impl_->insert_async(first, last, ref(op::insert), stream); } @@ -153,10 +153,10 @@ template template void static_multiset::insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) { this->insert_if_async(first, last, stencil, pred, stream); - stream.synchronize(); + stream.wait(); } template template void static_multiset:: - insert_if_async( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept + insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda::stream_ref stream) noexcept { impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); } @@ -183,10 +186,10 @@ template template void static_multiset::contains( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const { this->contains_async(first, last, output_begin, stream); - stream.synchronize(); + stream.wait(); } template contains_async(first, last, output_begin, ref(op::contains), stream); } @@ -220,10 +223,10 @@ void static_multisetcontains_if_async(first, last, stencil, pred, output_begin, stream); - stream.synchronize(); + stream.wait(); } template contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream); } @@ -254,10 +257,10 @@ template template void static_multiset::find( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const { find_async(first, last, output_begin, stream); - stream.synchronize(); + stream.wait(); } template template void static_multiset::find_async( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const { impl_->find_async(first, last, output_begin, ref(op::find), stream); } @@ -284,7 +287,7 @@ template static_multiset::size_type static_multiset::count( - InputIt first, InputIt last, cuda_stream_ref stream) const noexcept + InputIt first, InputIt last, cuda::stream_ref stream) const noexcept { return impl_->count(first, last, ref(op::count), stream); } @@ -303,7 +306,7 @@ static_multiset InputIt last, ProbeKeyEqual const& probe_key_equal, ProbeHash const& probe_hash, - cuda_stream_ref stream) const noexcept + cuda::stream_ref stream) const noexcept { return impl_->count(first, last, @@ -325,7 +328,7 @@ static_multiset InputIt last, ProbeKeyEqual const& probe_key_equal, ProbeHash const& probe_hash, - cuda_stream_ref stream) const noexcept + cuda::stream_ref stream) const noexcept { return impl_->count_outer( first, @@ -343,7 +346,7 @@ template static_multiset::size_type static_multiset::size( - cuda_stream_ref stream) const + cuda::stream_ref stream) const { return impl_->size(stream); } diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl index d3cece00f..3802454d9 100644 --- a/include/cuco/detail/static_set/static_set.inl +++ b/include/cuco/detail/static_set/static_set.inl @@ -40,7 +40,7 @@ constexpr static_set, Storage, Allocator const& alloc, - cuda_stream_ref stream) + cuda::stream_ref stream) : impl_{std::make_unique( capacity, empty_key_sentinel, pred, probing_scheme, alloc, stream)} { @@ -62,7 +62,7 @@ constexpr static_set, Storage, Allocator const& alloc, - cuda_stream_ref stream) + cuda::stream_ref stream) : impl_{std::make_unique( n, desired_load_factor, empty_key_sentinel, pred, probing_scheme, alloc, stream)} { @@ -84,7 +84,7 @@ constexpr static_set, Storage, Allocator const& alloc, - cuda_stream_ref stream) + cuda::stream_ref stream) : impl_{std::make_unique( capacity, empty_key_sentinel, erased_key_sentinel, pred, probing_scheme, alloc, stream)} { @@ -98,7 +98,7 @@ template void static_set::clear( - cuda_stream_ref stream) + cuda::stream_ref stream) { impl_->clear(stream); } @@ -111,7 +111,7 @@ template void static_set::clear_async( - cuda_stream_ref stream) noexcept + cuda::stream_ref stream) noexcept { impl_->clear_async(stream); } @@ -126,7 +126,7 @@ template static_set::size_type static_set::insert( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) { return impl_->insert(first, last, ref(op::insert), stream); } @@ -140,7 +140,7 @@ template template void static_set::insert_async( - InputIt first, InputIt last, cuda_stream_ref stream) noexcept + InputIt first, InputIt last, cuda::stream_ref stream) noexcept { impl_->insert_async(first, last, ref(op::insert), stream); } @@ -155,7 +155,7 @@ template static_set::size_type static_set::insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) { return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); } @@ -169,7 +169,7 @@ template template void static_set::insert_if_async( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) noexcept { impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); } @@ -183,10 +183,10 @@ template template void static_set::erase( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) { erase_async(first, last, stream); - stream.synchronize(); + stream.wait(); } template template void static_set::erase_async( - InputIt first, InputIt last, cuda_stream_ref stream) + InputIt first, InputIt last, cuda::stream_ref stream) { impl_->erase_async(first, last, ref(op::erase), stream); } @@ -212,10 +212,10 @@ template template void static_set::contains( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const { contains_async(first, last, output_begin, stream); - stream.synchronize(); + stream.wait(); } template template void static_set::contains_async( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const noexcept + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const noexcept { impl_->contains_async(first, last, output_begin, ref(op::contains), stream); } @@ -246,10 +246,10 @@ void static_set StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream) const + cuda::stream_ref stream) const { contains_if_async(first, last, stencil, pred, output_begin, stream); - stream.synchronize(); + stream.wait(); } template StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream) const noexcept + cuda::stream_ref stream) const noexcept { impl_->contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream); } @@ -280,10 +280,10 @@ template template void static_set::find( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const { find_async(first, last, output_begin, stream); - stream.synchronize(); + stream.wait(); } template template void static_set::find_async( - InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const + InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const { impl_->find_async(first, last, output_begin, ref(op::find), stream); } @@ -314,7 +314,7 @@ static_set::ret InputIt last, OutputIt1 output_probe, OutputIt2 output_match, - cuda_stream_ref stream) const + cuda::stream_ref stream) const { auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return {output_probe, output_match}; } @@ -326,7 +326,7 @@ static_set::ret auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); static_set_ns::detail::retrieve - <<>>( + <<>>( first, num_keys, output_probe, output_match, counter.data(), ref(op::find)); auto const count = counter.load_to_host(stream); @@ -347,7 +347,7 @@ OutputIt static_set template OutputIt static_set::retrieve_all( - OutputIt output_begin, cuda_stream_ref stream) const + OutputIt output_begin, cuda::stream_ref stream) const { return impl_->retrieve_all(output_begin, stream); } @@ -374,7 +374,7 @@ template void static_set::rehash( - cuda_stream_ref stream) + cuda::stream_ref stream) { this->impl_->rehash(*this, stream); } @@ -387,7 +387,7 @@ template void static_set::rehash( - size_type capacity, cuda_stream_ref stream) + size_type capacity, cuda::stream_ref stream) { auto const extent = make_window_extent(capacity); this->impl_->rehash(extent, *this, stream); @@ -401,7 +401,7 @@ template void static_set::rehash_async( - cuda_stream_ref stream) + cuda::stream_ref stream) { this->impl_->rehash_async(*this, stream); } @@ -414,7 +414,7 @@ template void static_set::rehash_async( - size_type capacity, cuda_stream_ref stream) + size_type capacity, cuda::stream_ref stream) { auto const extent = make_window_extent(capacity); this->impl_->rehash_async(extent, *this, stream); @@ -429,7 +429,7 @@ template static_set::size_type static_set::size( - cuda_stream_ref stream) const + cuda::stream_ref stream) const { return impl_->size(stream); } diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl index 94b7f98d0..dfbb90327 100644 --- a/include/cuco/detail/storage/aow_storage.inl +++ b/include/cuco/detail/storage/aow_storage.inl @@ -16,13 +16,13 @@ #pragma once -#include #include #include #include #include #include +#include #include #include @@ -64,21 +64,21 @@ aow_storage::ref() const noexcept template void aow_storage::initialize(value_type key, - cuda_stream_ref stream) + cuda::stream_ref stream) { this->initialize_async(key, stream); - stream.synchronize(); + stream.wait(); } template void aow_storage::initialize_async( - value_type key, cuda_stream_ref stream) noexcept + value_type key, cuda::stream_ref stream) noexcept { auto constexpr cg_size = 1; auto constexpr stride = 4; auto const grid_size = cuco::detail::grid_size(this->num_windows(), cg_size, stride); - detail::initialize<<>>( + detail::initialize<<>>( this->data(), this->num_windows(), key); } diff --git a/include/cuco/detail/storage/counter_storage.cuh b/include/cuco/detail/storage/counter_storage.cuh index 3055a68e7..6f891b404 100644 --- a/include/cuco/detail/storage/counter_storage.cuh +++ b/include/cuco/detail/storage/counter_storage.cuh @@ -16,12 +16,12 @@ #pragma once -#include #include #include #include #include +#include #include @@ -64,10 +64,10 @@ class counter_storage : public storage_base> { * * @param stream CUDA stream used to reset */ - void reset(cuda_stream_ref stream) + void reset(cuda::stream_ref stream) { static_assert(sizeof(size_type) == sizeof(value_type)); - CUCO_CUDA_TRY(cudaMemsetAsync(this->data(), 0, sizeof(value_type), stream)); + CUCO_CUDA_TRY(cudaMemsetAsync(this->data(), 0, sizeof(value_type), stream.get())); } /** @@ -92,12 +92,12 @@ class counter_storage : public storage_base> { * @param stream CUDA stream used to copy device value to the host * @return Value of the atomic counter */ - [[nodiscard]] constexpr size_type load_to_host(cuda_stream_ref stream) const + [[nodiscard]] constexpr size_type load_to_host(cuda::stream_ref stream) const { size_type h_count; - CUCO_CUDA_TRY( - cudaMemcpyAsync(&h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream)); - stream.synchronize(); + CUCO_CUDA_TRY(cudaMemcpyAsync( + &h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream.get())); + stream.wait(); return h_count; } diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 7ada54123..be324ab8d 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -17,9 +17,8 @@ #pragma once -#include - #include +#include #include #include @@ -143,7 +142,7 @@ class dynamic_bitset { constexpr void test(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream = {}) noexcept; + cuda::stream_ref stream = {}) noexcept; /** * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total @@ -163,7 +162,7 @@ class dynamic_bitset { constexpr void rank(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream = {}) noexcept; + cuda::stream_ref stream = {}) noexcept; /** * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the @@ -183,7 +182,7 @@ class dynamic_bitset { constexpr void select(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream = {}) noexcept; + cuda::stream_ref stream = {}) noexcept; using rank_type = cuco::experimental::detail::rank; ///< Rank type @@ -350,7 +349,7 @@ class dynamic_bitset { * * @param stream Stream to execute kernels */ - constexpr void build(cuda_stream_ref stream = {}) noexcept; + constexpr void build(cuda::stream_ref stream = {}) noexcept; /** * @brief Populates rank and select indexes for true or false bits @@ -364,7 +363,7 @@ class dynamic_bitset { thrust::device_vector& ranks, thrust::device_vector& selects, bool flip_bits, - cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); }; } // namespace detail diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index a62ce502c..8d53d52e0 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -76,7 +76,7 @@ template constexpr void dynamic_bitset::test(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream) noexcept + cuda::stream_ref stream) noexcept { build(); @@ -85,7 +85,7 @@ constexpr void dynamic_bitset::test(KeyIt keys_begin, auto const grid_size = cuco::detail::grid_size(num_keys); - bitset_test_kernel<<>>( + bitset_test_kernel<<>>( ref(), keys_begin, outputs_begin, num_keys); } @@ -94,7 +94,7 @@ template constexpr void dynamic_bitset::rank(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream) noexcept + cuda::stream_ref stream) noexcept { build(); auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -102,7 +102,7 @@ constexpr void dynamic_bitset::rank(KeyIt keys_begin, auto const grid_size = cuco::detail::grid_size(num_keys); - bitset_rank_kernel<<>>( + bitset_rank_kernel<<>>( ref(), keys_begin, outputs_begin, num_keys); } @@ -111,7 +111,7 @@ template constexpr void dynamic_bitset::select(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream) noexcept + cuda::stream_ref stream) noexcept { build(); @@ -120,7 +120,7 @@ constexpr void dynamic_bitset::select(KeyIt keys_begin, auto const grid_size = cuco::detail::grid_size(num_keys); - bitset_select_kernel<<>>( + bitset_select_kernel<<>>( ref(), keys_begin, outputs_begin, num_keys); } @@ -129,7 +129,7 @@ constexpr void dynamic_bitset::build_ranks_and_selects( thrust::device_vector& ranks, thrust::device_vector& selects, bool flip_bits, - cuda_stream_ref stream) + cuda::stream_ref stream) { if (n_bits_ == 0) { return; } @@ -143,15 +143,19 @@ constexpr void dynamic_bitset::build_ranks_and_selects( auto const bit_counts_begin = thrust::raw_pointer_cast(bit_counts.data()); auto grid_size = cuco::detail::grid_size(num_words); - bit_counts_kernel<<>>( + bit_counts_kernel<<>>( thrust::raw_pointer_cast(words_.data()), bit_counts_begin, num_words, flip_bits); std::size_t temp_storage_bytes = 0; using temp_allocator_type = typename std::allocator_traits::rebind_alloc; auto temp_allocator = temp_allocator_type{this->allocator_}; - CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum( - nullptr, temp_storage_bytes, bit_counts_begin, bit_counts_begin, bit_counts_size, stream)); + CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum(nullptr, + temp_storage_bytes, + bit_counts_begin, + bit_counts_begin, + bit_counts_size, + stream.get())); // Allocate temporary storage auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); @@ -161,7 +165,7 @@ constexpr void dynamic_bitset::build_ranks_and_selects( bit_counts_begin, bit_counts_begin, bit_counts_size, - stream)); + stream.get())); temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); @@ -170,25 +174,30 @@ constexpr void dynamic_bitset::build_ranks_and_selects( ranks.resize(num_blocks); grid_size = cuco::detail::grid_size(num_blocks); - encode_ranks_from_prefix_bit_counts<<>>( - bit_counts_begin, - thrust::raw_pointer_cast(ranks.data()), - num_words, - num_blocks, - words_per_block); + encode_ranks_from_prefix_bit_counts<<>>(bit_counts_begin, + thrust::raw_pointer_cast(ranks.data()), + num_words, + num_blocks, + words_per_block); // Step 3. Compute selects thrust::device_vector select_markers(num_blocks, this->allocator_); auto const select_markers_begin = thrust::raw_pointer_cast(select_markers.data()); - mark_blocks_with_select_entries<<>>( + mark_blocks_with_select_entries<<>>( bit_counts_begin, select_markers_begin, num_blocks, words_per_block, bits_per_block); auto d_sum = reinterpret_cast(thrust::raw_pointer_cast( std::allocator_traits::allocate(temp_allocator, sizeof(size_type)))); CUCO_CUDA_TRY(cub::DeviceReduce::Sum( - nullptr, temp_storage_bytes, select_markers_begin, d_sum, num_blocks, stream)); + nullptr, temp_storage_bytes, select_markers_begin, d_sum, num_blocks, stream.get())); d_temp_storage = temp_allocator.allocate(temp_storage_bytes); @@ -197,12 +206,12 @@ constexpr void dynamic_bitset::build_ranks_and_selects( select_markers_begin, d_sum, num_blocks, - stream)); + stream.get())); size_type num_selects{}; CUCO_CUDA_TRY( - cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); - stream.synchronize(); + cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get())); + stream.wait(); std::allocator_traits::deallocate( temp_allocator, thrust::device_ptr{reinterpret_cast(d_sum)}, sizeof(size_type)); temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); @@ -218,7 +227,7 @@ constexpr void dynamic_bitset::build_ranks_and_selects( select_begin, thrust::make_discard_iterator(), num_blocks, - stream)); + stream.get())); d_temp_storage = temp_allocator.allocate(temp_storage_bytes); @@ -229,13 +238,13 @@ constexpr void dynamic_bitset::build_ranks_and_selects( select_begin, thrust::make_discard_iterator(), num_blocks, - stream)); + stream.get())); temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); } template -constexpr void dynamic_bitset::build(cuda_stream_ref stream) noexcept +constexpr void dynamic_bitset::build(cuda::stream_ref stream) noexcept { if (not is_built_) { build_ranks_and_selects(ranks_true_, selects_true_, false, stream); // 1 bits diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index e4dee193f..feb8f00a0 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include #include @@ -23,6 +22,8 @@ #include #include +#include + #include #include #include @@ -71,7 +72,7 @@ class distinct_count_estimator { constexpr distinct_count_estimator(cuco::sketch_size_kb sketch_size_kb = 32_KB, Hash const& hash = {}, Allocator const& alloc = {}, - cuco::cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); /** * @brief Constructs a `distinct_count_estimator` host object. @@ -84,9 +85,9 @@ class distinct_count_estimator { * @param stream CUDA stream used to initialize the object */ constexpr distinct_count_estimator(cuco::standard_deviation standard_deviation, - Hash const& hash = {}, - Allocator const& alloc = {}, - cuco::cuda_stream_ref stream = {}); + Hash const& hash = {}, + Allocator const& alloc = {}, + cuda::stream_ref stream = {}); ~distinct_count_estimator() = default; @@ -106,7 +107,7 @@ class distinct_count_estimator { * * @param stream CUDA stream this operation is executed in */ - constexpr void clear_async(cuco::cuda_stream_ref stream = {}) noexcept; + constexpr void clear_async(cuda::stream_ref stream = {}) noexcept; /** * @brief Resets the estimator, i.e., clears the current count estimate. @@ -116,7 +117,7 @@ class distinct_count_estimator { * * @param stream CUDA stream this operation is executed in */ - constexpr void clear(cuco::cuda_stream_ref stream = {}); + constexpr void clear(cuda::stream_ref stream = {}); /** * @brief Asynchronously adds to be counted items to the estimator. @@ -130,7 +131,7 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); + constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief Adds to be counted items to the estimator. @@ -147,7 +148,7 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); + constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief Asynchronously merges the result of `other` estimator into `*this` estimator. @@ -163,7 +164,7 @@ class distinct_count_estimator { template constexpr void merge_async( distinct_count_estimator const& other, - cuco::cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); /** * @brief Merges the result of `other` estimator into `*this` estimator. @@ -181,7 +182,7 @@ class distinct_count_estimator { */ template constexpr void merge(distinct_count_estimator const& other, - cuco::cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); /** * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. @@ -194,8 +195,7 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - constexpr void merge_async(ref_type const& other_ref, - cuco::cuda_stream_ref stream = {}); + constexpr void merge_async(ref_type const& other_ref, cuda::stream_ref stream = {}); /** * @brief Merges the result of `other` estimator reference into `*this` estimator. @@ -211,7 +211,7 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - constexpr void merge(ref_type const& other_ref, cuco::cuda_stream_ref stream = {}); + constexpr void merge(ref_type const& other_ref, cuda::stream_ref stream = {}); /** * @brief Compute the estimated distinct items count. @@ -222,7 +222,7 @@ class distinct_count_estimator { * * @return Approximate distinct items count */ - [[nodiscard]] constexpr std::size_t estimate(cuco::cuda_stream_ref stream = {}) const; + [[nodiscard]] constexpr std::size_t estimate(cuda::stream_ref stream = {}) const; /** * @brief Get device ref. diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index 44374c6b5..cb566990e 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -15,12 +15,13 @@ */ #pragma once -#include #include #include #include #include +#include + #include #include @@ -81,7 +82,7 @@ class distinct_count_estimator_ref { * * @param stream CUDA stream this operation is executed in */ - __host__ constexpr void clear_async(cuco::cuda_stream_ref stream = {}) noexcept; + __host__ constexpr void clear_async(cuda::stream_ref stream = {}) noexcept; /** * @brief Resets the estimator, i.e., clears the current count estimate. @@ -91,7 +92,7 @@ class distinct_count_estimator_ref { * * @param stream CUDA stream this operation is executed in */ - __host__ constexpr void clear(cuco::cuda_stream_ref stream = {}); + __host__ constexpr void clear(cuda::stream_ref stream = {}); /** * @brief Adds an item to the estimator. @@ -112,7 +113,7 @@ class distinct_count_estimator_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); + __host__ constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief Adds to be counted items to the estimator. @@ -129,7 +130,7 @@ class distinct_count_estimator_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {}); + __host__ constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief Merges the result of `other` estimator reference into `*this` estimator reference. @@ -158,8 +159,7 @@ class distinct_count_estimator_ref { */ template __host__ constexpr void merge_async( - distinct_count_estimator_ref const& other, - cuco::cuda_stream_ref stream = {}); + distinct_count_estimator_ref const& other, cuda::stream_ref stream = {}); /** * @brief Merges the result of `other` estimator reference into `*this` estimator. @@ -176,7 +176,7 @@ class distinct_count_estimator_ref { */ template __host__ constexpr void merge(distinct_count_estimator_ref const& other, - cuco::cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); /** * @brief Compute the estimated distinct items count. @@ -197,7 +197,7 @@ class distinct_count_estimator_ref { * * @return Approximate distinct items count */ - [[nodiscard]] __host__ constexpr std::size_t estimate(cuco::cuda_stream_ref stream = {}) const; + [[nodiscard]] __host__ constexpr std::size_t estimate(cuda::stream_ref stream = {}) const; /** * @brief Gets the hash function. diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 95da423f0..40556ed06 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include @@ -29,6 +28,7 @@ #include #include +#include #include #if defined(CUCO_HAS_CUDA_BARRIER) @@ -181,7 +181,7 @@ class static_map { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); /** * @brief Constructs a statically-sized map with the number of elements to insert `n`, the desired @@ -225,7 +225,7 @@ class static_map { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); /** * @brief Constructs a statically-sized map with the specified initial capacity, sentinel values @@ -261,7 +261,7 @@ class static_map { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); /** * @brief Erases all elements from the container. After this call, `size()` returns zero. @@ -269,7 +269,7 @@ class static_map { * * @param stream CUDA stream this operation is executed in */ - void clear(cuda_stream_ref stream = {}); + void clear(cuda::stream_ref stream = {}); /** * @brief Asynchronously erases all elements from the container. After this call, `size()` returns @@ -277,7 +277,7 @@ class static_map { * * @param stream CUDA stream this operation is executed in */ - void clear_async(cuda_stream_ref stream = {}) noexcept; + void clear_async(cuda::stream_ref stream = {}) noexcept; /** * @brief Inserts all keys in the range `[first, last)` and returns the number of successful @@ -297,7 +297,7 @@ class static_map { * @return Number of successful insertions */ template - size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); + size_type insert(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief Asynchronously inserts all keys in the range `[first, last)`. @@ -311,7 +311,7 @@ class static_map { * @param stream CUDA stream used for insert */ template - void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + void insert_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept; /** * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns @@ -339,7 +339,7 @@ class static_map { */ template size_type insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {}); /** * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding @@ -366,7 +366,7 @@ class static_map { InputIt last, StencilIt stencil, Predicate pred, - cuda_stream_ref stream = {}) noexcept; + cuda::stream_ref stream = {}) noexcept; /** * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` @@ -387,7 +387,7 @@ class static_map { * @param stream CUDA stream used for insert */ template - void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}); + void insert_or_assign(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` @@ -406,7 +406,7 @@ class static_map { * @param stream CUDA stream used for insert */ template - void insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + void insert_or_assign_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept; /** * @brief Erases keys in the range `[first, last)`. @@ -433,7 +433,7 @@ class static_map { * provided at construction */ template - void erase(InputIt first, InputIt last, cuda_stream_ref stream = {}); + void erase(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief Asynchronously erases keys in the range `[first, last)`. @@ -458,7 +458,7 @@ class static_map { * provided at construction */ template - void erase_async(InputIt first, InputIt last, cuda_stream_ref stream = {}); + void erase_async(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. @@ -478,7 +478,7 @@ class static_map { void contains(InputIt first, InputIt last, OutputIt output_begin, - cuda_stream_ref stream = {}) const; + cuda::stream_ref stream = {}) const; /** * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in @@ -496,7 +496,7 @@ class static_map { void contains_async(InputIt first, InputIt last, OutputIt output_begin, - cuda_stream_ref stream = {}) const noexcept; + cuda::stream_ref stream = {}) const noexcept; /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map if @@ -529,7 +529,7 @@ class static_map { StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream = {}) const; + cuda::stream_ref stream = {}) const; /** * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in @@ -560,7 +560,7 @@ class static_map { StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream = {}) const noexcept; + cuda::stream_ref stream = {}) const noexcept; /** * @brief For all keys in the range `[first, last)`, finds a payload with its key equivalent to @@ -579,7 +579,7 @@ class static_map { * @param stream Stream used for executing the kernels */ template - void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const; + void find(InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream = {}) const; /** * @brief For all keys in the range `[first, last)`, asynchronously finds a payload with its key @@ -600,7 +600,7 @@ class static_map { void find_async(InputIt first, InputIt last, OutputIt output_begin, - cuda_stream_ref stream = {}) const; + cuda::stream_ref stream = {}) const; /** * @brief Retrieves all of the keys and their associated values. @@ -625,7 +625,7 @@ class static_map { template std::pair retrieve_all(KeyOut keys_out, ValueOut values_out, - cuda_stream_ref stream = {}) const; + cuda::stream_ref stream = {}) const; /** * @brief Regenerates the container. @@ -635,7 +635,7 @@ class static_map { * * @param stream CUDA stream used for this operation */ - void rehash(cuda_stream_ref stream = {}); + void rehash(cuda::stream_ref stream = {}); /** * @brief Reserves at least the specified number of slots and regenerates the container @@ -655,14 +655,14 @@ class static_map { * @param capacity New capacity of the container * @param stream CUDA stream used for this operation */ - void rehash(size_type capacity, cuda_stream_ref stream = {}); + void rehash(size_type capacity, cuda::stream_ref stream = {}); /** * @brief Asynchronously regenerates the container. * * @param stream CUDA stream used for this operation */ - void rehash_async(cuda_stream_ref stream = {}); + void rehash_async(cuda::stream_ref stream = {}); /** * @brief Asynchronously reserves at least the specified number of slots and regenerates the @@ -680,7 +680,7 @@ class static_map { * @param capacity New capacity of the container * @param stream CUDA stream used for this operation */ - void rehash_async(size_type capacity, cuda_stream_ref stream = {}); + void rehash_async(size_type capacity, cuda::stream_ref stream = {}); /** * @brief Gets the number of elements in the container. @@ -690,7 +690,7 @@ class static_map { * @param stream CUDA stream used to get the number of inserted elements * @return The number of elements in the container */ - [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; + [[nodiscard]] size_type size(cuda::stream_ref stream = {}) const; /** * @brief Gets the maximum number of elements the hash map can hold. diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh index 81e5cda74..e23d1b02b 100644 --- a/include/cuco/static_multiset.cuh +++ b/include/cuco/static_multiset.cuh @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include @@ -29,6 +28,7 @@ #include #include +#include #include #include @@ -151,7 +151,7 @@ class static_multiset { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); /** * @brief Constructs a statically-sized multiset with the number of elements to insert `n`, the @@ -193,7 +193,7 @@ class static_multiset { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); /** * @brief Constructs a statically-sized set with the specified initial capacity, sentinel values @@ -227,7 +227,7 @@ class static_multiset { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); /** * @brief Erases all elements from the container. After this call, `size()` returns zero. @@ -235,7 +235,7 @@ class static_multiset { * * @param stream CUDA stream this operation is executed in */ - void clear(cuda_stream_ref stream = {}); + void clear(cuda::stream_ref stream = {}); /** * @brief Asynchronously erases all elements from the container. After this call, `size()` returns @@ -243,7 +243,7 @@ class static_multiset { * * @param stream CUDA stream this operation is executed in */ - void clear_async(cuda_stream_ref stream = {}) noexcept; + void clear_async(cuda::stream_ref stream = {}) noexcept; /** * @brief Inserts all keys in the range `[first, last)` @@ -261,7 +261,7 @@ class static_multiset { * @param stream CUDA stream used for insert */ template - void insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); + void insert(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief Asynchronously inserts all keys in the range `[first, last)`. @@ -276,7 +276,7 @@ class static_multiset { * @param stream CUDA stream used for insert */ template - void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + void insert_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept; /** * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns @@ -302,7 +302,7 @@ class static_multiset { */ template void insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {}); /** * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding @@ -329,7 +329,7 @@ class static_multiset { InputIt last, StencilIt stencil, Predicate pred, - cuda_stream_ref stream = {}) noexcept; + cuda::stream_ref stream = {}) noexcept; /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the multiset. @@ -349,7 +349,7 @@ class static_multiset { void contains(InputIt first, InputIt last, OutputIt output_begin, - cuda_stream_ref stream = {}) const; + cuda::stream_ref stream = {}) const; /** * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in @@ -367,7 +367,7 @@ class static_multiset { void contains_async(InputIt first, InputIt last, OutputIt output_begin, - cuda_stream_ref stream = {}) const noexcept; + cuda::stream_ref stream = {}) const noexcept; /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the multiset if @@ -400,7 +400,7 @@ class static_multiset { StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream = {}) const; + cuda::stream_ref stream = {}) const; /** * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in @@ -431,7 +431,7 @@ class static_multiset { StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream = {}) const noexcept; + cuda::stream_ref stream = {}) const noexcept; /** * @brief For all keys in the range `[first, last)`, finds an element with key equivalent to the @@ -450,7 +450,7 @@ class static_multiset { * @param stream Stream used for executing the kernels */ template - void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const; + void find(InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream = {}) const; /** * @brief For all keys in the range `[first, last)`, asynchronously finds an element with key @@ -471,7 +471,7 @@ class static_multiset { void find_async(InputIt first, InputIt last, OutputIt output_begin, - cuda_stream_ref stream = {}) const; + cuda::stream_ref stream = {}) const; /** * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset @@ -487,7 +487,7 @@ class static_multiset { * @return The sum of total occurrences of all keys in `[first, last)` */ template - size_type count(InputIt first, InputIt last, cuda_stream_ref stream = {}) const noexcept; + size_type count(InputIt first, InputIt last, cuda::stream_ref stream = {}) const noexcept; /** * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset @@ -511,7 +511,7 @@ class static_multiset { InputIt last, ProbeKeyEqual const& probe_key_equal, ProbeHash const& probe_hash, - cuda_stream_ref stream = {}) const noexcept; + cuda::stream_ref stream = {}) const noexcept; /** * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset @@ -537,7 +537,7 @@ class static_multiset { InputIt last, ProbeKeyEqual const& probe_key_equal, ProbeHash const& probe_hash, - cuda_stream_ref stream = {}) const noexcept; + cuda::stream_ref stream = {}) const noexcept; /** * @brief Gets the number of elements in the container. @@ -547,7 +547,7 @@ class static_multiset { * @param stream CUDA stream used to get the number of inserted elements * @return The number of elements in the container */ - [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; + [[nodiscard]] size_type size(cuda::stream_ref stream = {}) const; /** * @brief Gets the maximum number of elements the multiset can hold. diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh index 3517f8429..a83649409 100644 --- a/include/cuco/static_set.cuh +++ b/include/cuco/static_set.cuh @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include @@ -30,6 +29,7 @@ #include #include +#include #include #if defined(CUCO_HAS_CUDA_BARRIER) @@ -156,7 +156,7 @@ class static_set { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); /** * @brief Constructs a statically-sized set with the number of elements to insert `n`, the desired @@ -198,7 +198,7 @@ class static_set { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); /** * @brief Constructs a statically-sized set with the specified initial capacity, sentinel values @@ -232,7 +232,7 @@ class static_set { cuda_thread_scope scope = {}, Storage storage = {}, Allocator const& alloc = {}, - cuda_stream_ref stream = {}); + cuda::stream_ref stream = {}); /** * @brief Erases all elements from the container. After this call, `size()` returns zero. @@ -240,7 +240,7 @@ class static_set { * * @param stream CUDA stream this operation is executed in */ - void clear(cuda_stream_ref stream = {}); + void clear(cuda::stream_ref stream = {}); /** * @brief Asynchronously erases all elements from the container. After this call, `size()` returns @@ -248,7 +248,7 @@ class static_set { * * @param stream CUDA stream this operation is executed in */ - void clear_async(cuda_stream_ref stream = {}) noexcept; + void clear_async(cuda::stream_ref stream = {}) noexcept; /** * @brief Inserts all keys in the range `[first, last)` and returns the number of successful @@ -268,7 +268,7 @@ class static_set { * @return Number of successfully inserted keys */ template - size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); + size_type insert(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief Asynchronously inserts all keys in the range `[first, last)`. @@ -282,7 +282,7 @@ class static_set { * @param stream CUDA stream used for insert */ template - void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + void insert_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept; /** * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns @@ -310,7 +310,7 @@ class static_set { */ template size_type insert_if( - InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {}); /** * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding @@ -337,7 +337,7 @@ class static_set { InputIt last, StencilIt stencil, Predicate pred, - cuda_stream_ref stream = {}) noexcept; + cuda::stream_ref stream = {}) noexcept; /** * @brief Erases keys in the range `[first, last)`. @@ -364,7 +364,7 @@ class static_set { * provided at construction */ template - void erase(InputIt first, InputIt last, cuda_stream_ref stream = {}); + void erase(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief Asynchronously erases keys in the range `[first, last)`. @@ -389,7 +389,7 @@ class static_set { * provided at construction */ template - void erase_async(InputIt first, InputIt last, cuda_stream_ref stream = {}); + void erase_async(InputIt first, InputIt last, cuda::stream_ref stream = {}); /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the set. @@ -409,7 +409,7 @@ class static_set { void contains(InputIt first, InputIt last, OutputIt output_begin, - cuda_stream_ref stream = {}) const; + cuda::stream_ref stream = {}) const; /** * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in @@ -427,7 +427,7 @@ class static_set { void contains_async(InputIt first, InputIt last, OutputIt output_begin, - cuda_stream_ref stream = {}) const noexcept; + cuda::stream_ref stream = {}) const noexcept; /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the set if @@ -460,7 +460,7 @@ class static_set { StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream = {}) const; + cuda::stream_ref stream = {}) const; /** * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in @@ -491,7 +491,7 @@ class static_set { StencilIt stencil, Predicate pred, OutputIt output_begin, - cuda_stream_ref stream = {}) const noexcept; + cuda::stream_ref stream = {}) const noexcept; /** * @brief For all keys in the range `[first, last)`, finds an element with key equivalent to the @@ -510,7 +510,7 @@ class static_set { * @param stream Stream used for executing the kernels */ template - void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const; + void find(InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream = {}) const; /** * @brief For all keys in the range `[first, last)`, asynchronously finds an element with key @@ -531,7 +531,7 @@ class static_set { void find_async(InputIt first, InputIt last, OutputIt output_begin, - cuda_stream_ref stream = {}) const; + cuda::stream_ref stream = {}) const; /** * @brief Retrieves the matched key in the set corresponding to all probe keys in the range @@ -564,7 +564,7 @@ class static_set { InputIt last, OutputIt1 output_probe, OutputIt2 output_match, - cuda_stream_ref stream = {}) const; + cuda::stream_ref stream = {}) const; /** * @brief Asynchronously retrieves the matched key in the set corresponding to all probe keys in @@ -602,7 +602,7 @@ class static_set { OutputIt output_begin, ProbeEqual const& probe_equal = ProbeEqual{}, ProbeHash const& probe_hash = ProbeHash{}, - cuda_stream_ref stream = {}) const; + cuda::stream_ref stream = {}) const; /** * @brief Retrieves all keys contained in the set. @@ -622,7 +622,7 @@ class static_set { * @return Iterator indicating the end of the output */ template - OutputIt retrieve_all(OutputIt output_begin, cuda_stream_ref stream = {}) const; + OutputIt retrieve_all(OutputIt output_begin, cuda::stream_ref stream = {}) const; /** * @brief Regenerates the container. @@ -632,7 +632,7 @@ class static_set { * * @param stream CUDA stream used for this operation */ - void rehash(cuda_stream_ref stream = {}); + void rehash(cuda::stream_ref stream = {}); /** * @brief Reserves at least the specified number of slots and regenerates the container @@ -652,14 +652,14 @@ class static_set { * @param capacity New capacity of the container * @param stream CUDA stream used for this operation */ - void rehash(size_type capacity, cuda_stream_ref stream = {}); + void rehash(size_type capacity, cuda::stream_ref stream = {}); /** * @brief Asynchronously regenerates the container. * * @param stream CUDA stream used for this operation */ - void rehash_async(cuda_stream_ref stream = {}); + void rehash_async(cuda::stream_ref stream = {}); /** * @brief Asynchronously reserves at least the specified number of slots and regenerates the @@ -677,7 +677,7 @@ class static_set { * @param capacity New capacity of the container * @param stream CUDA stream used for this operation */ - void rehash_async(size_type capacity, cuda_stream_ref stream = {}); + void rehash_async(size_type capacity, cuda::stream_ref stream = {}); /** * @brief Gets the number of elements in the container. @@ -687,7 +687,7 @@ class static_set { * @param stream CUDA stream used to get the number of inserted elements * @return The number of elements in the container */ - [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; + [[nodiscard]] size_type size(cuda::stream_ref stream = {}) const; /** * @brief Gets the maximum number of elements the hash set can hold.