diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh
index 522819311..abdbaaff7 100644
--- a/include/cuco/aow_storage.cuh
+++ b/include/cuco/aow_storage.cuh
@@ -16,12 +16,12 @@
 
 #pragma once
 
-#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/storage/aow_storage_base.cuh>
 #include <cuco/extent.cuh>
 #include <cuco/utility/allocator.hpp>
 
 #include <cuda/std/array>
+#include <cuda/stream_ref>
 
 #include <cstddef>
 #include <cstdint>
@@ -122,7 +122,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
    * @param key Key to which all keys in `slots` are initialized
    * @param stream Stream used for executing the kernel
    */
-  void initialize(value_type key, cuda_stream_ref stream = {});
+  void initialize(value_type key, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously initializes each slot in the AoW storage to contain `key`.
@@ -130,7 +130,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
    * @param key Key to which all keys in `slots` are initialized
    * @param stream Stream used for executing the kernel
    */
-  void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept;
+  void initialize_async(value_type key, cuda::stream_ref stream = {}) noexcept;
 
  private:
   allocator_type allocator_;            ///< Allocator used to (de)allocate windows
diff --git a/include/cuco/cuda_stream_ref.hpp b/include/cuco/cuda_stream_ref.hpp
deleted file mode 100644
index 19cd3093e..000000000
--- a/include/cuco/cuda_stream_ref.hpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cuda_runtime_api.h>
-
-#include <cstddef>
-
-namespace cuco {
-
-/**
- * @brief Strongly-typed non-owning wrapper for CUDA streams with default constructor.
- *
- * This wrapper is simply a "view": it does not own the lifetime of the stream it wraps.
- */
-class cuda_stream_ref {
- public:
-  constexpr cuda_stream_ref()                       = default;  ///< Default constructor
-  constexpr cuda_stream_ref(cuda_stream_ref const&) = default;  ///< Copy constructor
-  constexpr cuda_stream_ref(cuda_stream_ref&&)      = default;  ///< Move constructor
-
-  /**
-   * @brief Copy-assignment operator.
-   *
-   * @return Copy of this stream reference.
-   */
-  constexpr cuda_stream_ref& operator=(cuda_stream_ref const&) = default;
-
-  /**
-   * @brief Move-assignment operator.
-   *
-   * @return New location of this stream reference.
-   */
-  constexpr cuda_stream_ref& operator=(cuda_stream_ref&&) = default;  ///< Move-assignment operator
-
-  ~cuda_stream_ref() = default;
-
-  constexpr cuda_stream_ref(int)            = delete;  //< Prevent cast from literal 0
-  constexpr cuda_stream_ref(std::nullptr_t) = delete;  //< Prevent cast from nullptr
-
-  /**
-   * @brief Implicit conversion from `cudaStream_t`.
-   *
-   * @param stream The CUDA stream to reference.
-   */
-  constexpr cuda_stream_ref(cudaStream_t stream) noexcept : stream_{stream} {}
-
-  /**
-   * @brief Get the wrapped stream.
-   *
-   * @return The wrapped stream.
-   */
-  [[nodiscard]] constexpr cudaStream_t value() const noexcept { return stream_; }
-
-  /**
-   * @brief Implicit conversion to `cudaStream_t`.
-   *
-   * @return The underlying `cudaStream_t`.
-   */
-  constexpr operator cudaStream_t() const noexcept { return value(); }
-
-  /**
-   * @brief Return true if the wrapped stream is the CUDA per-thread default stream.
-   *
-   * @return True if the wrapped stream is the per-thread default stream; else false.
-   */
-  [[nodiscard]] inline bool is_per_thread_default() const noexcept;
-
-  /**
-   * @brief Return true if the wrapped stream is explicitly the CUDA legacy default stream.
-   *
-   * @return True if the wrapped stream is the default stream; else false.
-   */
-  [[nodiscard]] inline bool is_default() const noexcept;
-
-  /**
-   * @brief Synchronize the viewed CUDA stream.
-   *
-   * Calls `cudaStreamSynchronize()`.
-   *
-   * @throw cuco::cuda_error if stream synchronization fails
-   */
-  void synchronize() const;
-
- private:
-  cudaStream_t stream_{};
-};
-
-/**
- * @brief Static `cuda_stream_ref` of the default stream (stream 0), for convenience
- */
-static constexpr cuda_stream_ref cuda_stream_default{};
-
-/**
- * @brief Static `cuda_stream_ref` of cudaStreamLegacy, for convenience
- */
-static const cuda_stream_ref cuda_stream_legacy{cudaStreamLegacy};
-
-/**
- * @brief Static `cuda_stream_ref` of cudaStreamPerThread, for convenience
- */
-static const cuda_stream_ref cuda_stream_per_thread{cudaStreamPerThread};
-
-// /**
-//  * @brief Equality comparison operator for streams
-//  *
-//  * @param lhs The first stream view to compare
-//  * @param rhs The second stream view to compare
-//  * @return true if equal, false if unequal
-//  */
-// inline bool operator==(cuda_stream_ref lhs, cuda_stream_ref rhs)
-// {
-//   return lhs.value() == rhs.value();
-// }
-
-// /**
-//  * @brief Inequality comparison operator for streams
-//  *
-//  * @param lhs The first stream view to compare
-//  * @param rhs The second stream view to compare
-//  * @return true if unequal, false if equal
-//  */
-// inline bool operator!=(cuda_stream_ref lhs, cuda_stream_ref rhs) { return not(lhs == rhs); }
-
-}  // namespace cuco
-
-#include <cuco/detail/cuda_stream_ref.inl>
diff --git a/include/cuco/detail/cuda_stream_ref.inl b/include/cuco/detail/cuda_stream_ref.inl
deleted file mode 100644
index 16f11f04d..000000000
--- a/include/cuco/detail/cuda_stream_ref.inl
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cuco/cuda_stream_ref.hpp>
-#include <cuco/detail/error.hpp>
-
-#include <cuda_runtime_api.h>
-
-namespace cuco {
-
-[[nodiscard]] inline bool cuda_stream_ref::is_per_thread_default() const noexcept
-{
-#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
-  return value() == cuda_stream_per_thread || value() == nullptr;
-#else
-  return value() == cuda_stream_per_thread;
-#endif
-}
-
-[[nodiscard]] inline bool cuda_stream_ref::is_default() const noexcept
-{
-#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
-  return value() == cuda_stream_legacy;
-#else
-  return value() == cuda_stream_legacy || value() == nullptr;
-#endif
-}
-
-inline void cuda_stream_ref::synchronize() const
-{
-  CUCO_CUDA_TRY(cudaStreamSynchronize(this->stream_));
-}
-
-}  // namespace cuco
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index 2caa806ea..be5c3764e 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -21,7 +21,7 @@ constexpr distinct_count_estimator<T, Scope, Hash, Allocator>::distinct_count_es
   cuco::sketch_size_kb sketch_size_kb,
   Hash const& hash,
   Allocator const& alloc,
-  cuco::cuda_stream_ref stream)
+  cuda::stream_ref stream)
   : impl_{std::make_unique<impl_type>(sketch_size_kb, hash, alloc, stream)}
 {
 }
@@ -31,21 +31,20 @@ constexpr distinct_count_estimator<T, Scope, Hash, Allocator>::distinct_count_es
   cuco::standard_deviation standard_deviation,
   Hash const& hash,
   Allocator const& alloc,
-  cuco::cuda_stream_ref stream)
+  cuda::stream_ref stream)
   : impl_{std::make_unique<impl_type>(standard_deviation, hash, alloc, stream)}
 {
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::clear_async(
-  cuco::cuda_stream_ref stream) noexcept
+  cuda::stream_ref stream) noexcept
 {
   this->impl_->clear_async(stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::clear(
-  cuco::cuda_stream_ref stream)
+constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::clear(cuda::stream_ref stream)
 {
   this->impl_->clear(stream);
 }
@@ -53,15 +52,16 @@ constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::clear(
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <class InputIt>
 constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::add_async(
-  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  InputIt first, InputIt last, cuda::stream_ref stream)
 {
   this->impl_->add_async(first, last, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <class InputIt>
-constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::add(
-  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::add(InputIt first,
+                                                                        InputIt last,
+                                                                        cuda::stream_ref stream)
 {
   this->impl_->add(first, last, stream);
 }
@@ -70,7 +70,7 @@ template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope, class OtherAllocator>
 constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
   distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
-  cuco::cuda_stream_ref stream)
+  cuda::stream_ref stream)
 {
   this->impl_->merge_async(*(other.impl_), stream);
 }
@@ -79,7 +79,7 @@ template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope, class OtherAllocator>
 constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
   distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
-  cuco::cuda_stream_ref stream)
+  cuda::stream_ref stream)
 {
   this->impl_->merge(*(other.impl_), stream);
 }
@@ -87,7 +87,7 @@ constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope>
 constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
-  ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
+  ref_type<OtherScope> const& other_ref, cuda::stream_ref stream)
 {
   this->impl_->merge_async(other_ref.impl_, stream);
 }
@@ -95,14 +95,14 @@ constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 template <cuda::thread_scope OtherScope>
 constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
-  ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
+  ref_type<OtherScope> const& other_ref, cuda::stream_ref stream)
 {
   this->impl_->merge(other_ref.impl_, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
 constexpr std::size_t distinct_count_estimator<T, Scope, Hash, Allocator>::estimate(
-  cuco::cuda_stream_ref stream) const
+  cuda::stream_ref stream) const
 {
   return this->impl_->estimate(stream);
 }
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
index 97649d5bc..6ded30148 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -34,14 +34,13 @@ __device__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::clear(
 
 template <class T, cuda::thread_scope Scope, class Hash>
 __host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::clear_async(
-  cuco::cuda_stream_ref stream) noexcept
+  cuda::stream_ref stream) noexcept
 {
   this->impl_.clear_async(stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::clear(
-  cuco::cuda_stream_ref stream)
+__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::clear(cuda::stream_ref stream)
 {
   this->impl_.clear(stream);
 }
@@ -55,15 +54,16 @@ __device__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::add(T co
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class InputIt>
 __host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::add_async(
-  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  InputIt first, InputIt last, cuda::stream_ref stream)
 {
   this->impl_.add_async(first, last, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class InputIt>
-__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::add(
-  InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::add(InputIt first,
+                                                                          InputIt last,
+                                                                          cuda::stream_ref stream)
 {
   this->impl_.add(first, last, stream);
 }
@@ -79,7 +79,7 @@ __device__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::merge(
 template <class T, cuda::thread_scope Scope, class Hash>
 template <cuda::thread_scope OtherScope>
 __host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::merge_async(
-  distinct_count_estimator_ref<T, OtherScope, Hash> const& other, cuco::cuda_stream_ref stream)
+  distinct_count_estimator_ref<T, OtherScope, Hash> const& other, cuda::stream_ref stream)
 {
   this->impl_.merge_async(other, stream);
 }
@@ -87,7 +87,7 @@ __host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::merge_asyn
 template <class T, cuda::thread_scope Scope, class Hash>
 template <cuda::thread_scope OtherScope>
 __host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::merge(
-  distinct_count_estimator_ref<T, OtherScope, Hash> const& other, cuco::cuda_stream_ref stream)
+  distinct_count_estimator_ref<T, OtherScope, Hash> const& other, cuda::stream_ref stream)
 {
   this->impl_.merge(other, stream);
 }
@@ -101,7 +101,7 @@ __device__ std::size_t distinct_count_estimator_ref<T, Scope, Hash>::estimate(
 
 template <class T, cuda::thread_scope Scope, class Hash>
 __host__ constexpr std::size_t distinct_count_estimator_ref<T, Scope, Hash>::estimate(
-  cuco::cuda_stream_ref stream) const
+  cuda::stream_ref stream) const
 {
   return this->impl_.estimate(stream);
 }
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 011d2bee7..98a5e4857 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -15,7 +15,6 @@
  */
 #pragma once
 
-#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/error.hpp>
 #include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
 #include <cuco/detail/storage/storage_base.cuh>
@@ -23,6 +22,8 @@
 #include <cuco/types.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
+#include <cuda/stream_ref>
+
 #include <cstddef>
 #include <iterator>
 #include <memory>
@@ -69,7 +70,7 @@ class hyperloglog {
   constexpr hyperloglog(std::size_t sketch_size_b,
                         Hash const& hash,
                         Allocator const& alloc,
-                        cuco::cuda_stream_ref stream)
+                        cuda::stream_ref stream)
     : allocator_{alloc},
       sketch_{this->allocator_.allocate(sketch_size_b / sizeof(register_type)),
               custom_deleter{sketch_size_b / sizeof(register_type), this->allocator_}},
@@ -92,7 +93,7 @@ class hyperloglog {
   constexpr hyperloglog(cuco::sketch_size_kb sketch_size_kb,
                         Hash const& hash,
                         Allocator const& alloc,
-                        cuco::cuda_stream_ref stream)
+                        cuda::stream_ref stream)
     : hyperloglog{sketch_bytes(sketch_size_kb), hash, alloc, stream}
   {
   }
@@ -110,7 +111,7 @@ class hyperloglog {
   constexpr hyperloglog(cuco::standard_deviation standard_deviation,
                         Hash const& hash,
                         Allocator const& alloc,
-                        cuco::cuda_stream_ref stream)
+                        cuda::stream_ref stream)
     : hyperloglog{sketch_bytes(standard_deviation), hash, alloc, stream}
   {
   }
@@ -133,10 +134,7 @@ class hyperloglog {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  constexpr void clear_async(cuco::cuda_stream_ref stream) noexcept
-  {
-    this->ref_.clear_async(stream);
-  }
+  constexpr void clear_async(cuda::stream_ref stream) noexcept { this->ref_.clear_async(stream); }
 
   /**
    * @brief Resets the estimator, i.e., clears the current count estimate.
@@ -146,7 +144,7 @@ class hyperloglog {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  constexpr void clear(cuco::cuda_stream_ref stream) { this->ref_.clear(stream); }
+  constexpr void clear(cuda::stream_ref stream) { this->ref_.clear(stream); }
 
   /**
    * @brief Asynchronously adds to be counted items to the estimator.
@@ -160,7 +158,7 @@ class hyperloglog {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream)
   {
     this->ref_.add_async(first, last, stream);
   }
@@ -180,7 +178,7 @@ class hyperloglog {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream)
   {
     this->ref_.add(first, last, stream);
   }
@@ -198,7 +196,7 @@ class hyperloglog {
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
   constexpr void merge_async(hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other,
-                             cuco::cuda_stream_ref stream)
+                             cuda::stream_ref stream)
   {
     this->ref_.merge_async(other.ref(), stream);
   }
@@ -219,7 +217,7 @@ class hyperloglog {
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
   constexpr void merge(hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other,
-                       cuco::cuda_stream_ref stream)
+                       cuda::stream_ref stream)
   {
     this->ref_.merge(other.ref(), stream);
   }
@@ -235,7 +233,7 @@ class hyperloglog {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  constexpr void merge_async(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
+  constexpr void merge_async(ref_type<OtherScope> const& other_ref, cuda::stream_ref stream)
   {
     this->ref_.merge_async(other_ref, stream);
   }
@@ -254,7 +252,7 @@ class hyperloglog {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  constexpr void merge(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream)
+  constexpr void merge(ref_type<OtherScope> const& other_ref, cuda::stream_ref stream)
   {
     this->ref_.merge(other_ref, stream);
   }
@@ -268,7 +266,7 @@ class hyperloglog {
    *
    * @return Approximate distinct items count
    */
-  [[nodiscard]] constexpr std::size_t estimate(cuco::cuda_stream_ref stream) const
+  [[nodiscard]] constexpr std::size_t estimate(cuda::stream_ref stream) const
   {
     return this->ref_.estimate(stream);
   }
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index bd7cd95ad..f68f54594 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -15,7 +15,6 @@
  */
 #pragma once
 
-#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/__config>
 #include <cuco/detail/error.hpp>
 #include <cuco/detail/hyperloglog/finalizer.cuh>
@@ -29,6 +28,7 @@
 #include <cuda/std/bit>
 #include <cuda/std/span>
 #include <cuda/std/utility>
+#include <cuda/stream_ref>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 
 #include <cooperative_groups.h>
@@ -122,10 +122,10 @@ class hyperloglog_ref {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  __host__ constexpr void clear(cuco::cuda_stream_ref stream)
+  __host__ constexpr void clear(cuda::stream_ref stream)
   {
     this->clear_async(stream);
-    stream.synchronize();
+    stream.wait();
   }
 
   /**
@@ -133,10 +133,10 @@ class hyperloglog_ref {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  __host__ constexpr void clear_async(cuco::cuda_stream_ref stream) noexcept
+  __host__ constexpr void clear_async(cuda::stream_ref stream) noexcept
   {
     auto constexpr block_size = 1024;
-    cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream>>>(*this);
+    cuco::hyperloglog_ns::detail::clear<<<1, block_size, 0, stream.get()>>>(*this);
   }
 
   /**
@@ -169,7 +169,7 @@ class hyperloglog_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  __host__ constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  __host__ constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream)
   {
     auto const num_items = cuco::detail::distance(first, last);
     if (num_items == 0) { return; }
@@ -222,7 +222,7 @@ class hyperloglog_ref {
           (void*)(&num_items),
           reinterpret_cast<void*>(this)};
         CUCO_CUDA_TRY(
-          cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream));
+          cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream.get()));
       }
     } else {
       kernel = reinterpret_cast<void const*>(
@@ -233,7 +233,7 @@ class hyperloglog_ref {
           cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, shmem_bytes));
 
         CUCO_CUDA_TRY(
-          cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream));
+          cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, shmem_bytes, stream.get()));
       } else {
         // Computes sketch directly in global memory. (Fallback path in case there is not enough
         // shared memory avalable)
@@ -242,7 +242,8 @@ class hyperloglog_ref {
 
         CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, 0));
 
-        CUCO_CUDA_TRY(cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, 0, stream));
+        CUCO_CUDA_TRY(
+          cudaLaunchKernel(kernel, grid_size, block_size, kernel_args, 0, stream.get()));
       }
     }
   }
@@ -262,10 +263,10 @@ class hyperloglog_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  __host__ constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream)
+  __host__ constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream)
   {
     this->add_async(first, last, stream);
-    stream.synchronize();
+    stream.wait();
   }
 
   /**
@@ -304,13 +305,13 @@ class hyperloglog_ref {
    */
   template <cuda::thread_scope OtherScope>
   __host__ constexpr void merge_async(hyperloglog_ref<T, OtherScope, Hash> const& other,
-                                      cuco::cuda_stream_ref stream)
+                                      cuda::stream_ref stream)
   {
     CUCO_EXPECTS(other.precision_ == this->precision_,
                  "Cannot merge estimators with different sketch sizes",
                  std::runtime_error);
     auto constexpr block_size = 1024;
-    cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream>>>(other, *this);
+    cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream.get()>>>(other, *this);
   }
 
   /**
@@ -328,10 +329,10 @@ class hyperloglog_ref {
    */
   template <cuda::thread_scope OtherScope>
   __host__ constexpr void merge(hyperloglog_ref<T, OtherScope, Hash> const& other,
-                                cuco::cuda_stream_ref stream)
+                                cuda::stream_ref stream)
   {
     this->merge_async(other, stream);
-    stream.synchronize();
+    stream.wait();
   }
 
   /**
@@ -403,7 +404,7 @@ class hyperloglog_ref {
    *
    * @return Approximate distinct items count
    */
-  [[nodiscard]] __host__ constexpr std::size_t estimate(cuco::cuda_stream_ref stream) const
+  [[nodiscard]] __host__ constexpr std::size_t estimate(cuda::stream_ref stream) const
   {
     auto const num_regs = 1ull << this->precision_;
     std::vector<register_type> host_sketch(num_regs);
@@ -413,8 +414,8 @@ class hyperloglog_ref {
                                   this->sketch_.data(),
                                   sizeof(register_type) * num_regs,
                                   cudaMemcpyDefault,
-                                  stream));
-    stream.synchronize();
+                                  stream.get()));
+    stream.wait();
 
     fp_type sum = 0;
     int zeroes  = 0;
@@ -566,4 +567,4 @@ class hyperloglog_ref {
   template <class T_, cuda::thread_scope Scope_, class Hash_>
   friend class hyperloglog_ref;
 };
-}  // namespace cuco::detail
\ No newline at end of file
+}  // namespace cuco::detail
diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
index 548ad09a6..befde9685 100644
--- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh
+++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
@@ -125,7 +125,7 @@ class open_addressing_impl {
                                  KeyEqual const& pred,
                                  ProbingScheme const& probing_scheme,
                                  Allocator const& alloc,
-                                 cuda_stream_ref stream)
+                                 cuda::stream_ref stream)
     : empty_slot_sentinel_{empty_slot_sentinel},
       erased_key_sentinel_{this->extract_key(empty_slot_sentinel)},
       predicate_{pred},
@@ -171,7 +171,7 @@ class open_addressing_impl {
                                  KeyEqual const& pred,
                                  ProbingScheme const& probing_scheme,
                                  Allocator const& alloc,
-                                 cuda_stream_ref stream)
+                                 cuda::stream_ref stream)
     : empty_slot_sentinel_{empty_slot_sentinel},
       erased_key_sentinel_{this->extract_key(empty_slot_sentinel)},
       predicate_{pred},
@@ -213,7 +213,7 @@ class open_addressing_impl {
                                  KeyEqual const& pred,
                                  ProbingScheme const& probing_scheme,
                                  Allocator const& alloc,
-                                 cuda_stream_ref stream)
+                                 cuda::stream_ref stream)
     : empty_slot_sentinel_{empty_slot_sentinel},
       erased_key_sentinel_{erased_key_sentinel},
       predicate_{pred},
@@ -233,7 +233,7 @@ class open_addressing_impl {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear(cuda_stream_ref stream) { storage_.initialize(empty_slot_sentinel_, stream); }
+  void clear(cuda::stream_ref stream) { storage_.initialize(empty_slot_sentinel_, stream); }
 
   /**
    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
@@ -241,7 +241,7 @@ class open_addressing_impl {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear_async(cuda_stream_ref stream) noexcept
+  void clear_async(cuda::stream_ref stream) noexcept
   {
     storage_.initialize_async(empty_slot_sentinel_, stream);
   }
@@ -266,7 +266,7 @@ class open_addressing_impl {
    * @return Number of successfully inserted keys
    */
   template <typename InputIt, typename Ref>
-  size_type insert(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream)
+  size_type insert(InputIt first, InputIt last, Ref container_ref, cuda::stream_ref stream)
   {
     auto const always_true = thrust::constant_iterator<bool>{true};
     return this->insert_if(first, last, always_true, thrust::identity{}, container_ref, stream);
@@ -286,7 +286,10 @@ class open_addressing_impl {
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt, typename Ref>
-  void insert_async(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream) noexcept
+  void insert_async(InputIt first,
+                    InputIt last,
+                    Ref container_ref,
+                    cuda::stream_ref stream) noexcept
   {
     auto const always_true = thrust::constant_iterator<bool>{true};
     this->insert_if_async(first, last, always_true, thrust::identity{}, container_ref, stream);
@@ -324,7 +327,7 @@ class open_addressing_impl {
                       StencilIt stencil,
                       Predicate pred,
                       Ref container_ref,
-                      cuda_stream_ref stream)
+                      cuda::stream_ref stream)
   {
     auto const num_keys = cuco::detail::distance(first, last);
     if (num_keys == 0) { return 0; }
@@ -336,7 +339,7 @@ class open_addressing_impl {
     auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);
 
     detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
-      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+      <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
         first, num_keys, stencil, pred, counter.data(), container_ref);
 
     return counter.load_to_host(stream);
@@ -370,7 +373,7 @@ class open_addressing_impl {
                        StencilIt stencil,
                        Predicate pred,
                        Ref container_ref,
-                       cuda_stream_ref stream) noexcept
+                       cuda::stream_ref stream) noexcept
   {
     auto const num_keys = cuco::detail::distance(first, last);
     if (num_keys == 0) { return; }
@@ -378,7 +381,7 @@ class open_addressing_impl {
     auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);
 
     detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
-      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+      <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
         first, num_keys, stencil, pred, container_ref);
   }
 
@@ -406,7 +409,7 @@ class open_addressing_impl {
    * provided at construction
    */
   template <typename InputIt, typename Ref>
-  void erase_async(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream = {})
+  void erase_async(InputIt first, InputIt last, Ref container_ref, cuda::stream_ref stream = {})
   {
     CUCO_EXPECTS(this->empty_key_sentinel() != this->erased_key_sentinel(),
                  "The empty key sentinel and erased key sentinel cannot be the same value.",
@@ -418,7 +421,7 @@ class open_addressing_impl {
     auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);
 
     detail::erase<cg_size, cuco::detail::default_block_size()>
-      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+      <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
         first, num_keys, container_ref);
   }
 
@@ -441,7 +444,7 @@ class open_addressing_impl {
                       InputIt last,
                       OutputIt output_begin,
                       Ref container_ref,
-                      cuda_stream_ref stream) const noexcept
+                      cuda::stream_ref stream) const noexcept
   {
     auto const always_true = thrust::constant_iterator<bool>{true};
     this->contains_if_async(
@@ -484,7 +487,7 @@ class open_addressing_impl {
                          Predicate pred,
                          OutputIt output_begin,
                          Ref container_ref,
-                         cuda_stream_ref stream) const noexcept
+                         cuda::stream_ref stream) const noexcept
   {
     auto const num_keys = cuco::detail::distance(first, last);
     if (num_keys == 0) { return; }
@@ -492,7 +495,7 @@ class open_addressing_impl {
     auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);
 
     detail::contains_if_n<cg_size, cuco::detail::default_block_size()>
-      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+      <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
         first, num_keys, stencil, pred, output_begin, container_ref);
   }
 
@@ -515,7 +518,7 @@ class open_addressing_impl {
                   InputIt last,
                   OutputIt output_begin,
                   Ref container_ref,
-                  cuda_stream_ref stream) const noexcept
+                  cuda::stream_ref stream) const noexcept
   {
     auto const num_keys = cuco::detail::distance(first, last);
     if (num_keys == 0) { return; }
@@ -523,7 +526,7 @@ class open_addressing_impl {
     auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);
 
     detail::find<cg_size, cuco::detail::default_block_size()>
-      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+      <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
         first, num_keys, output_begin, container_ref);
   }
 
@@ -543,7 +546,7 @@ class open_addressing_impl {
   [[nodiscard]] size_type count(InputIt first,
                                 InputIt last,
                                 Ref container_ref,
-                                cuda_stream_ref stream) const noexcept
+                                cuda::stream_ref stream) const noexcept
   {
     auto constexpr is_outer = false;
     return this->count<is_outer>(first, last, container_ref, stream);
@@ -567,7 +570,7 @@ class open_addressing_impl {
   [[nodiscard]] size_type count_outer(InputIt first,
                                       InputIt last,
                                       Ref container_ref,
-                                      cuda_stream_ref stream) const noexcept
+                                      cuda::stream_ref stream) const noexcept
   {
     auto constexpr is_outer = true;
     return this->count<is_outer>(first, last, container_ref, stream);
@@ -591,7 +594,7 @@ class open_addressing_impl {
    * @return Iterator indicating the end of the output
    */
   template <typename OutputIt>
-  [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cuda_stream_ref stream) const
+  [[nodiscard]] OutputIt retrieve_all(OutputIt output_begin, cuda::stream_ref stream) const
   {
     std::size_t temp_storage_bytes = 0;
     using temp_allocator_type =
@@ -611,7 +614,7 @@ class open_addressing_impl {
                                         d_num_out,
                                         this->capacity(),
                                         is_filled,
-                                        stream));
+                                        stream.get()));
 
     // Allocate temporary storage
     auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes);
@@ -623,12 +626,12 @@ class open_addressing_impl {
                                         d_num_out,
                                         this->capacity(),
                                         is_filled,
-                                        stream));
+                                        stream.get()));
 
     size_type h_num_out;
-    CUCO_CUDA_TRY(
-      cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
-    stream.synchronize();
+    CUCO_CUDA_TRY(cudaMemcpyAsync(
+      &h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
+    stream.wait();
     std::allocator_traits<temp_allocator_type>::deallocate(
       temp_allocator, reinterpret_cast<char*>(d_num_out), sizeof(size_type));
     temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
@@ -645,7 +648,7 @@ class open_addressing_impl {
    *
    * @return The number of elements in the container
    */
-  [[nodiscard]] size_type size(cuda_stream_ref stream) const
+  [[nodiscard]] size_type size(cuda::stream_ref stream) const
   {
     auto counter =
       detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
@@ -658,7 +661,7 @@ class open_addressing_impl {
     // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to
     // v2.1.0
     detail::size<cuco::detail::default_block_size()>
-      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+      <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
         storage_.ref(), is_filled, counter.data());
 
     return counter.load_to_host(stream);
@@ -677,10 +680,10 @@ class open_addressing_impl {
    * @param stream CUDA stream used for this operation
    */
   template <typename Container>
-  void rehash(Container const& container, cuda_stream_ref stream)
+  void rehash(Container const& container, cuda::stream_ref stream)
   {
     this->rehash_async(container, stream);
-    stream.synchronize();
+    stream.wait();
   }
 
   /**
@@ -706,10 +709,10 @@ class open_addressing_impl {
    * @param stream CUDA stream used for this operation
    */
   template <typename Container>
-  void rehash(extent_type extent, Container const& container, cuda_stream_ref stream)
+  void rehash(extent_type extent, Container const& container, cuda::stream_ref stream)
   {
     this->rehash_async(extent, container, stream);
-    stream.synchronize();
+    stream.wait();
   }
 
   /**
@@ -722,7 +725,7 @@ class open_addressing_impl {
    * @param stream CUDA stream used for this operation
    */
   template <typename Container>
-  void rehash_async(Container const& container, cuda_stream_ref stream)
+  void rehash_async(Container const& container, cuda::stream_ref stream)
   {
     this->rehash_async(this->storage_.window_extent(), container, stream);
   }
@@ -747,7 +750,7 @@ class open_addressing_impl {
    * @param stream CUDA stream used for this operation
    */
   template <typename Container>
-  void rehash_async(extent_type extent, Container const& container, cuda_stream_ref stream)
+  void rehash_async(extent_type extent, Container const& container, cuda::stream_ref stream)
   {
     auto const old_storage = std::move(this->storage_);
     new (&storage_) storage_type{extent, this->allocator()};
@@ -762,7 +765,7 @@ class open_addressing_impl {
     auto const is_filled      = open_addressing_ns::detail::slot_is_filled<has_payload, key_type>{
       this->empty_key_sentinel(), this->erased_key_sentinel()};
 
-    detail::rehash<block_size><<<grid_size, block_size, 0, stream>>>(
+    detail::rehash<block_size><<<grid_size, block_size, 0, stream.get()>>>(
       old_storage.ref(), container.ref(op::insert), is_filled);
   }
 
@@ -844,7 +847,7 @@ class open_addressing_impl {
   [[nodiscard]] size_type count(InputIt first,
                                 InputIt last,
                                 Ref container_ref,
-                                cuda_stream_ref stream) const noexcept
+                                cuda::stream_ref stream) const noexcept
   {
     auto const num_keys = cuco::detail::distance(first, last);
     if (num_keys == 0) { return 0; }
@@ -856,7 +859,7 @@ class open_addressing_impl {
     auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);
 
     detail::count<IsOuter, cg_size, cuco::detail::default_block_size()>
-      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+      <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
         first, num_keys, counter.data(), container_ref);
 
     return counter.load_to_host(stream);
diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 3fa1d0220..9ca129038 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/bitwise_compare.cuh>
 #include <cuco/detail/static_map/kernels.cuh>
 #include <cuco/detail/utility/cuda.hpp>
@@ -22,6 +21,8 @@
 #include <cuco/operator.hpp>
 #include <cuco/static_map_ref.cuh>
 
+#include <cuda/stream_ref>
+
 #include <cstddef>
 
 namespace cuco {
@@ -43,7 +44,7 @@ constexpr static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator,
              cuda_thread_scope<Scope>,
              Storage,
              Allocator const& alloc,
-             cuda_stream_ref stream)
+             cuda::stream_ref stream)
   : impl_{std::make_unique<impl_type>(capacity,
                                       cuco::pair{empty_key_sentinel, empty_value_sentinel},
                                       pred,
@@ -72,7 +73,7 @@ constexpr static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator,
              cuda_thread_scope<Scope>,
              Storage,
              Allocator const& alloc,
-             cuda_stream_ref stream)
+             cuda::stream_ref stream)
   : impl_{std::make_unique<impl_type>(n,
                                       desired_load_factor,
                                       cuco::pair{empty_key_sentinel, empty_value_sentinel},
@@ -102,7 +103,7 @@ constexpr static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator,
              cuda_thread_scope<Scope>,
              Storage,
              Allocator const& alloc,
-             cuda_stream_ref stream)
+             cuda::stream_ref stream)
   : impl_{std::make_unique<impl_type>(capacity,
                                       cuco::pair{empty_key_sentinel, empty_value_sentinel},
                                       erased_key_sentinel,
@@ -123,7 +124,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear(
-  cuda_stream_ref stream)
+  cuda::stream_ref stream)
 {
   impl_->clear(stream);
 }
@@ -137,7 +138,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear_async(
-  cuda_stream_ref stream) noexcept
+  cuda::stream_ref stream) noexcept
 {
   impl_->clear_async(stream);
 }
@@ -153,7 +154,7 @@ template <class Key,
 template <typename InputIt>
 static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
 static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert(
-  InputIt first, InputIt last, cuda_stream_ref stream)
+  InputIt first, InputIt last, cuda::stream_ref stream)
 {
   return impl_->insert(first, last, ref(op::insert), stream);
 }
@@ -168,7 +169,7 @@ template <class Key,
           class Storage>
 template <typename InputIt>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert_async(
-  InputIt first, InputIt last, cuda_stream_ref stream) noexcept
+  InputIt first, InputIt last, cuda::stream_ref stream) noexcept
 {
   impl_->insert_async(first, last, ref(op::insert), stream);
 }
@@ -184,7 +185,7 @@ template <class Key,
 template <typename InputIt, typename StencilIt, typename Predicate>
 static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
 static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert_if(
-  InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream)
+  InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream)
 {
   return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream);
 }
@@ -199,8 +200,11 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename StencilIt, typename Predicate>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
-  insert_if_async(
-    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept
+  insert_if_async(InputIt first,
+                  InputIt last,
+                  StencilIt stencil,
+                  Predicate pred,
+                  cuda::stream_ref stream) noexcept
 {
   impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream);
 }
@@ -215,10 +219,10 @@ template <class Key,
           class Storage>
 template <typename InputIt>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
-  insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream)
+  insert_or_assign(InputIt first, InputIt last, cuda::stream_ref stream)
 {
   return this->insert_or_assign_async(first, last, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -231,7 +235,7 @@ template <class Key,
           class Storage>
 template <typename InputIt>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
-  insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream) noexcept
+  insert_or_assign_async(InputIt first, InputIt last, cuda::stream_ref stream) noexcept
 {
   auto const num = cuco::detail::distance(first, last);
   if (num == 0) { return; }
@@ -239,7 +243,7 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
   auto const grid_size = cuco::detail::grid_size(num, cg_size);
 
   static_map_ns::detail::insert_or_assign<cg_size, cuco::detail::default_block_size()>
-    <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+    <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
       first, num, ref(op::insert_or_assign));
 }
 
@@ -253,10 +257,10 @@ template <class Key,
           class Storage>
 template <typename InputIt>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::erase(
-  InputIt first, InputIt last, cuda_stream_ref stream)
+  InputIt first, InputIt last, cuda::stream_ref stream)
 {
   erase_async(first, last, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -269,7 +273,7 @@ template <class Key,
           class Storage>
 template <typename InputIt>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::erase_async(
-  InputIt first, InputIt last, cuda_stream_ref stream)
+  InputIt first, InputIt last, cuda::stream_ref stream)
 {
   impl_->erase_async(first, last, ref(op::erase), stream);
 }
@@ -284,10 +288,10 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename OutputIt>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::contains(
-  InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const
+  InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
 {
   contains_async(first, last, output_begin, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -300,7 +304,7 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename OutputIt>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::contains_async(
-  InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const noexcept
+  InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const noexcept
 {
   impl_->contains_async(first, last, output_begin, ref(op::contains), stream);
 }
@@ -320,10 +324,10 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
   StencilIt stencil,
   Predicate pred,
   OutputIt output_begin,
-  cuda_stream_ref stream) const
+  cuda::stream_ref stream) const
 {
   contains_if_async(first, last, stencil, pred, output_begin, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -341,7 +345,7 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
                     StencilIt stencil,
                     Predicate pred,
                     OutputIt output_begin,
-                    cuda_stream_ref stream) const noexcept
+                    cuda::stream_ref stream) const noexcept
 {
   impl_->contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream);
 }
@@ -356,10 +360,10 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename OutputIt>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::find(
-  InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const
+  InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
 {
   find_async(first, last, output_begin, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -372,7 +376,7 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename OutputIt>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::find_async(
-  InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const
+  InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
 {
   impl_->find_async(first, last, output_begin, ref(op::find), stream);
 }
@@ -388,7 +392,7 @@ template <class Key,
 template <typename KeyOut, typename ValueOut>
 std::pair<KeyOut, ValueOut>
 static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::retrieve_all(
-  KeyOut keys_out, ValueOut values_out, cuda_stream_ref stream) const
+  KeyOut keys_out, ValueOut values_out, cuda::stream_ref stream) const
 {
   auto const zipped_out_begin = thrust::make_zip_iterator(thrust::make_tuple(keys_out, values_out));
   auto const zipped_out_end   = impl_->retrieve_all(zipped_out_begin, stream);
@@ -406,7 +410,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
-  cuda_stream_ref stream)
+  cuda::stream_ref stream)
 {
   this->impl_->rehash(*this, stream);
 }
@@ -420,7 +424,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
-  size_type capacity, cuda_stream_ref stream)
+  size_type capacity, cuda::stream_ref stream)
 {
   auto const extent = make_window_extent<static_map>(capacity);
   this->impl_->rehash(extent, *this, stream);
@@ -435,7 +439,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
-  cuda_stream_ref stream)
+  cuda::stream_ref stream)
 {
   this->impl_->rehash_async(*this, stream);
 }
@@ -449,7 +453,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
-  size_type capacity, cuda_stream_ref stream)
+  size_type capacity, cuda::stream_ref stream)
 {
   auto const extent = make_window_extent<static_map>(capacity);
   this->impl_->rehash_async(extent, *this, stream);
@@ -465,7 +469,7 @@ template <class Key,
           class Storage>
 static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
 static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size(
-  cuda_stream_ref stream) const
+  cuda::stream_ref stream) const
 {
   return impl_->size(stream);
 }
diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl
index d24a090dd..4b2a5af23 100644
--- a/include/cuco/detail/static_multiset/static_multiset.inl
+++ b/include/cuco/detail/static_multiset/static_multiset.inl
@@ -39,7 +39,7 @@ constexpr static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator
                   cuda_thread_scope<Scope>,
                   Storage,
                   Allocator const& alloc,
-                  cuda_stream_ref stream)
+                  cuda::stream_ref stream)
   : impl_{std::make_unique<impl_type>(
       capacity, empty_key_sentinel, pred, probing_scheme, alloc, stream)}
 {
@@ -61,7 +61,7 @@ constexpr static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator
                   cuda_thread_scope<Scope>,
                   Storage,
                   Allocator const& alloc,
-                  cuda_stream_ref stream)
+                  cuda::stream_ref stream)
   : impl_{std::make_unique<impl_type>(
       n, desired_load_factor, empty_key_sentinel, pred, probing_scheme, alloc, stream)}
 {
@@ -83,7 +83,7 @@ constexpr static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator
                   cuda_thread_scope<Scope>,
                   Storage,
                   Allocator const& alloc,
-                  cuda_stream_ref stream)
+                  cuda::stream_ref stream)
   : impl_{std::make_unique<impl_type>(
       capacity, empty_key_sentinel, erased_key_sentinel, pred, probing_scheme, alloc, stream)}
 {
@@ -97,7 +97,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear(
-  cuda_stream_ref stream)
+  cuda::stream_ref stream)
 {
   impl_->clear(stream);
 }
@@ -110,7 +110,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear_async(
-  cuda_stream_ref stream) noexcept
+  cuda::stream_ref stream) noexcept
 {
   impl_->clear_async(stream);
 }
@@ -124,10 +124,10 @@ template <class Key,
           class Storage>
 template <typename InputIt>
 void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert(
-  InputIt first, InputIt last, cuda_stream_ref stream)
+  InputIt first, InputIt last, cuda::stream_ref stream)
 {
   this->insert_async(first, last, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -139,7 +139,7 @@ template <class Key,
           class Storage>
 template <typename InputIt>
 void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert_async(
-  InputIt first, InputIt last, cuda_stream_ref stream) noexcept
+  InputIt first, InputIt last, cuda::stream_ref stream) noexcept
 {
   impl_->insert_async(first, last, ref(op::insert), stream);
 }
@@ -153,10 +153,10 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename StencilIt, typename Predicate>
 void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert_if(
-  InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream)
+  InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream)
 {
   this->insert_if_async(first, last, stencil, pred, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -168,8 +168,11 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename StencilIt, typename Predicate>
 void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
-  insert_if_async(
-    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept
+  insert_if_async(InputIt first,
+                  InputIt last,
+                  StencilIt stencil,
+                  Predicate pred,
+                  cuda::stream_ref stream) noexcept
 {
   impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream);
 }
@@ -183,10 +186,10 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename OutputIt>
 void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::contains(
-  InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const
+  InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
 {
   this->contains_async(first, last, output_begin, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -201,7 +204,7 @@ void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Sto
   contains_async(InputIt first,
                  InputIt last,
                  OutputIt output_begin,
-                 cuda_stream_ref stream) const noexcept
+                 cuda::stream_ref stream) const noexcept
 {
   impl_->contains_async(first, last, output_begin, ref(op::contains), stream);
 }
@@ -220,10 +223,10 @@ void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Sto
   StencilIt stencil,
   Predicate pred,
   OutputIt output_begin,
-  cuda_stream_ref stream) const
+  cuda::stream_ref stream) const
 {
   this->contains_if_async(first, last, stencil, pred, output_begin, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -240,7 +243,7 @@ void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Sto
                     StencilIt stencil,
                     Predicate pred,
                     OutputIt output_begin,
-                    cuda_stream_ref stream) const noexcept
+                    cuda::stream_ref stream) const noexcept
 {
   impl_->contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream);
 }
@@ -254,10 +257,10 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename OutputIt>
 void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::find(
-  InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const
+  InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
 {
   find_async(first, last, output_begin, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -269,7 +272,7 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename OutputIt>
 void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::find_async(
-  InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const
+  InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
 {
   impl_->find_async(first, last, output_begin, ref(op::find), stream);
 }
@@ -284,7 +287,7 @@ template <class Key,
 template <typename InputIt>
 static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
 static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::count(
-  InputIt first, InputIt last, cuda_stream_ref stream) const noexcept
+  InputIt first, InputIt last, cuda::stream_ref stream) const noexcept
 {
   return impl_->count(first, last, ref(op::count), stream);
 }
@@ -303,7 +306,7 @@ static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>
   InputIt last,
   ProbeKeyEqual const& probe_key_equal,
   ProbeHash const& probe_hash,
-  cuda_stream_ref stream) const noexcept
+  cuda::stream_ref stream) const noexcept
 {
   return impl_->count(first,
                       last,
@@ -325,7 +328,7 @@ static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>
   InputIt last,
   ProbeKeyEqual const& probe_key_equal,
   ProbeHash const& probe_hash,
-  cuda_stream_ref stream) const noexcept
+  cuda::stream_ref stream) const noexcept
 {
   return impl_->count_outer(
     first,
@@ -343,7 +346,7 @@ template <class Key,
           class Storage>
 static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
 static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size(
-  cuda_stream_ref stream) const
+  cuda::stream_ref stream) const
 {
   return impl_->size(stream);
 }
diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl
index d3cece00f..3802454d9 100644
--- a/include/cuco/detail/static_set/static_set.inl
+++ b/include/cuco/detail/static_set/static_set.inl
@@ -40,7 +40,7 @@ constexpr static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Sto
   cuda_thread_scope<Scope>,
   Storage,
   Allocator const& alloc,
-  cuda_stream_ref stream)
+  cuda::stream_ref stream)
   : impl_{std::make_unique<impl_type>(
       capacity, empty_key_sentinel, pred, probing_scheme, alloc, stream)}
 {
@@ -62,7 +62,7 @@ constexpr static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Sto
   cuda_thread_scope<Scope>,
   Storage,
   Allocator const& alloc,
-  cuda_stream_ref stream)
+  cuda::stream_ref stream)
   : impl_{std::make_unique<impl_type>(
       n, desired_load_factor, empty_key_sentinel, pred, probing_scheme, alloc, stream)}
 {
@@ -84,7 +84,7 @@ constexpr static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Sto
   cuda_thread_scope<Scope>,
   Storage,
   Allocator const& alloc,
-  cuda_stream_ref stream)
+  cuda::stream_ref stream)
   : impl_{std::make_unique<impl_type>(
       capacity, empty_key_sentinel, erased_key_sentinel, pred, probing_scheme, alloc, stream)}
 {
@@ -98,7 +98,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear(
-  cuda_stream_ref stream)
+  cuda::stream_ref stream)
 {
   impl_->clear(stream);
 }
@@ -111,7 +111,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear_async(
-  cuda_stream_ref stream) noexcept
+  cuda::stream_ref stream) noexcept
 {
   impl_->clear_async(stream);
 }
@@ -126,7 +126,7 @@ template <class Key,
 template <typename InputIt>
 static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
 static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert(
-  InputIt first, InputIt last, cuda_stream_ref stream)
+  InputIt first, InputIt last, cuda::stream_ref stream)
 {
   return impl_->insert(first, last, ref(op::insert), stream);
 }
@@ -140,7 +140,7 @@ template <class Key,
           class Storage>
 template <typename InputIt>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert_async(
-  InputIt first, InputIt last, cuda_stream_ref stream) noexcept
+  InputIt first, InputIt last, cuda::stream_ref stream) noexcept
 {
   impl_->insert_async(first, last, ref(op::insert), stream);
 }
@@ -155,7 +155,7 @@ template <class Key,
 template <typename InputIt, typename StencilIt, typename Predicate>
 static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
 static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert_if(
-  InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream)
+  InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream)
 {
   return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream);
 }
@@ -169,7 +169,7 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename StencilIt, typename Predicate>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::insert_if_async(
-  InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept
+  InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream) noexcept
 {
   impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream);
 }
@@ -183,10 +183,10 @@ template <class Key,
           class Storage>
 template <typename InputIt>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::erase(
-  InputIt first, InputIt last, cuda_stream_ref stream)
+  InputIt first, InputIt last, cuda::stream_ref stream)
 {
   erase_async(first, last, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -198,7 +198,7 @@ template <class Key,
           class Storage>
 template <typename InputIt>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::erase_async(
-  InputIt first, InputIt last, cuda_stream_ref stream)
+  InputIt first, InputIt last, cuda::stream_ref stream)
 {
   impl_->erase_async(first, last, ref(op::erase), stream);
 }
@@ -212,10 +212,10 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename OutputIt>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::contains(
-  InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const
+  InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
 {
   contains_async(first, last, output_begin, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -227,7 +227,7 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename OutputIt>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::contains_async(
-  InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const noexcept
+  InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const noexcept
 {
   impl_->contains_async(first, last, output_begin, ref(op::contains), stream);
 }
@@ -246,10 +246,10 @@ void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>
   StencilIt stencil,
   Predicate pred,
   OutputIt output_begin,
-  cuda_stream_ref stream) const
+  cuda::stream_ref stream) const
 {
   contains_if_async(first, last, stencil, pred, output_begin, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -266,7 +266,7 @@ void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>
   StencilIt stencil,
   Predicate pred,
   OutputIt output_begin,
-  cuda_stream_ref stream) const noexcept
+  cuda::stream_ref stream) const noexcept
 {
   impl_->contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream);
 }
@@ -280,10 +280,10 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename OutputIt>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::find(
-  InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const
+  InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
 {
   find_async(first, last, output_begin, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -295,7 +295,7 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename OutputIt>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::find_async(
-  InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const
+  InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream) const
 {
   impl_->find_async(first, last, output_begin, ref(op::find), stream);
 }
@@ -314,7 +314,7 @@ static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::ret
   InputIt last,
   OutputIt1 output_probe,
   OutputIt2 output_match,
-  cuda_stream_ref stream) const
+  cuda::stream_ref stream) const
 {
   auto const num_keys = cuco::detail::distance(first, last);
   if (num_keys == 0) { return {output_probe, output_match}; }
@@ -326,7 +326,7 @@ static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::ret
   auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);
 
   static_set_ns::detail::retrieve<cuco::detail::default_block_size()>
-    <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+    <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
       first, num_keys, output_probe, output_match, counter.data(), ref(op::find));
 
   auto const count = counter.load_to_host(stream);
@@ -347,7 +347,7 @@ OutputIt static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stor
   OutputIt output_begin,
   ProbeEqual const& probe_equal,
   ProbeHash const& probe_hash,
-  cuda_stream_ref stream) const
+  cuda::stream_ref stream) const
 {
   CUCO_FAIL("Unsupported code path: retrieve with custom hash/equal");
 }
@@ -361,7 +361,7 @@ template <class Key,
           class Storage>
 template <typename OutputIt>
 OutputIt static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::retrieve_all(
-  OutputIt output_begin, cuda_stream_ref stream) const
+  OutputIt output_begin, cuda::stream_ref stream) const
 {
   return impl_->retrieve_all(output_begin, stream);
 }
@@ -374,7 +374,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
-  cuda_stream_ref stream)
+  cuda::stream_ref stream)
 {
   this->impl_->rehash(*this, stream);
 }
@@ -387,7 +387,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash(
-  size_type capacity, cuda_stream_ref stream)
+  size_type capacity, cuda::stream_ref stream)
 {
   auto const extent = make_window_extent<static_set>(capacity);
   this->impl_->rehash(extent, *this, stream);
@@ -401,7 +401,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
-  cuda_stream_ref stream)
+  cuda::stream_ref stream)
 {
   this->impl_->rehash_async(*this, stream);
 }
@@ -414,7 +414,7 @@ template <class Key,
           class Allocator,
           class Storage>
 void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::rehash_async(
-  size_type capacity, cuda_stream_ref stream)
+  size_type capacity, cuda::stream_ref stream)
 {
   auto const extent = make_window_extent<static_set>(capacity);
   this->impl_->rehash_async(extent, *this, stream);
@@ -429,7 +429,7 @@ template <class Key,
           class Storage>
 static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
 static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size(
-  cuda_stream_ref stream) const
+  cuda::stream_ref stream) const
 {
   return impl_->size(stream);
 }
diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl
index 94b7f98d0..dfbb90327 100644
--- a/include/cuco/detail/storage/aow_storage.inl
+++ b/include/cuco/detail/storage/aow_storage.inl
@@ -16,13 +16,13 @@
 
 #pragma once
 
-#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/storage/kernels.cuh>
 #include <cuco/detail/storage/storage_base.cuh>
 #include <cuco/detail/utility/cuda.hpp>
 #include <cuco/extent.cuh>
 
 #include <cuda/std/array>
+#include <cuda/stream_ref>
 
 #include <cstddef>
 #include <cstdint>
@@ -64,21 +64,21 @@ aow_storage<T, WindowSize, Extent, Allocator>::ref() const noexcept
 
 template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
 void aow_storage<T, WindowSize, Extent, Allocator>::initialize(value_type key,
-                                                               cuda_stream_ref stream)
+                                                               cuda::stream_ref stream)
 {
   this->initialize_async(key, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
 void aow_storage<T, WindowSize, Extent, Allocator>::initialize_async(
-  value_type key, cuda_stream_ref stream) noexcept
+  value_type key, cuda::stream_ref stream) noexcept
 {
   auto constexpr cg_size = 1;
   auto constexpr stride  = 4;
   auto const grid_size   = cuco::detail::grid_size(this->num_windows(), cg_size, stride);
 
-  detail::initialize<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+  detail::initialize<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
     this->data(), this->num_windows(), key);
 }
 
diff --git a/include/cuco/detail/storage/counter_storage.cuh b/include/cuco/detail/storage/counter_storage.cuh
index 3055a68e7..6f891b404 100644
--- a/include/cuco/detail/storage/counter_storage.cuh
+++ b/include/cuco/detail/storage/counter_storage.cuh
@@ -16,12 +16,12 @@
 
 #pragma once
 
-#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/error.hpp>
 #include <cuco/detail/storage/storage_base.cuh>
 #include <cuco/extent.cuh>
 
 #include <cuda/atomic>
+#include <cuda/stream_ref>
 
 #include <memory>
 
@@ -64,10 +64,10 @@ class counter_storage : public storage_base<cuco::extent<SizeType, 1>> {
    *
    * @param stream CUDA stream used to reset
    */
-  void reset(cuda_stream_ref stream)
+  void reset(cuda::stream_ref stream)
   {
     static_assert(sizeof(size_type) == sizeof(value_type));
-    CUCO_CUDA_TRY(cudaMemsetAsync(this->data(), 0, sizeof(value_type), stream));
+    CUCO_CUDA_TRY(cudaMemsetAsync(this->data(), 0, sizeof(value_type), stream.get()));
   }
 
   /**
@@ -92,12 +92,12 @@ class counter_storage : public storage_base<cuco::extent<SizeType, 1>> {
    * @param stream CUDA stream used to copy device value to the host
    * @return Value of the atomic counter
    */
-  [[nodiscard]] constexpr size_type load_to_host(cuda_stream_ref stream) const
+  [[nodiscard]] constexpr size_type load_to_host(cuda::stream_ref stream) const
   {
     size_type h_count;
-    CUCO_CUDA_TRY(
-      cudaMemcpyAsync(&h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream));
-    stream.synchronize();
+    CUCO_CUDA_TRY(cudaMemcpyAsync(
+      &h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
+    stream.wait();
     return h_count;
   }
 
diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
index 7ada54123..be324ab8d 100644
--- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
+++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
@@ -17,9 +17,8 @@
 
 #pragma once
 
-#include <cuco/cuda_stream_ref.hpp>
-
 #include <cuda/std/array>
+#include <cuda/stream_ref>
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/device_vector.h>
 
@@ -143,7 +142,7 @@ class dynamic_bitset {
   constexpr void test(KeyIt keys_begin,
                       KeyIt keys_end,
                       OutputIt outputs_begin,
-                      cuda_stream_ref stream = {}) noexcept;
+                      cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total
@@ -163,7 +162,7 @@ class dynamic_bitset {
   constexpr void rank(KeyIt keys_begin,
                       KeyIt keys_end,
                       OutputIt outputs_begin,
-                      cuda_stream_ref stream = {}) noexcept;
+                      cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the
@@ -183,7 +182,7 @@ class dynamic_bitset {
   constexpr void select(KeyIt keys_begin,
                         KeyIt keys_end,
                         OutputIt outputs_begin,
-                        cuda_stream_ref stream = {}) noexcept;
+                        cuda::stream_ref stream = {}) noexcept;
 
   using rank_type = cuco::experimental::detail::rank;  ///< Rank type
 
@@ -350,7 +349,7 @@ class dynamic_bitset {
    *
    * @param stream Stream to execute kernels
    */
-  constexpr void build(cuda_stream_ref stream = {}) noexcept;
+  constexpr void build(cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief Populates rank and select indexes for true or false bits
@@ -364,7 +363,7 @@ class dynamic_bitset {
     thrust::device_vector<rank_type, rank_allocator_type>& ranks,
     thrust::device_vector<size_type, size_allocator_type>& selects,
     bool flip_bits,
-    cuda_stream_ref stream = {});
+    cuda::stream_ref stream = {});
 };
 
 }  // namespace detail
diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl
index a62ce502c..8d53d52e0 100644
--- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl
+++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl
@@ -76,7 +76,7 @@ template <typename KeyIt, typename OutputIt>
 constexpr void dynamic_bitset<Allocator>::test(KeyIt keys_begin,
                                                KeyIt keys_end,
                                                OutputIt outputs_begin,
-                                               cuda_stream_ref stream) noexcept
+                                               cuda::stream_ref stream) noexcept
 
 {
   build();
@@ -85,7 +85,7 @@ constexpr void dynamic_bitset<Allocator>::test(KeyIt keys_begin,
 
   auto const grid_size = cuco::detail::grid_size(num_keys);
 
-  bitset_test_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+  bitset_test_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
     ref(), keys_begin, outputs_begin, num_keys);
 }
 
@@ -94,7 +94,7 @@ template <typename KeyIt, typename OutputIt>
 constexpr void dynamic_bitset<Allocator>::rank(KeyIt keys_begin,
                                                KeyIt keys_end,
                                                OutputIt outputs_begin,
-                                               cuda_stream_ref stream) noexcept
+                                               cuda::stream_ref stream) noexcept
 {
   build();
   auto const num_keys = cuco::detail::distance(keys_begin, keys_end);
@@ -102,7 +102,7 @@ constexpr void dynamic_bitset<Allocator>::rank(KeyIt keys_begin,
 
   auto const grid_size = cuco::detail::grid_size(num_keys);
 
-  bitset_rank_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+  bitset_rank_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
     ref(), keys_begin, outputs_begin, num_keys);
 }
 
@@ -111,7 +111,7 @@ template <typename KeyIt, typename OutputIt>
 constexpr void dynamic_bitset<Allocator>::select(KeyIt keys_begin,
                                                  KeyIt keys_end,
                                                  OutputIt outputs_begin,
-                                                 cuda_stream_ref stream) noexcept
+                                                 cuda::stream_ref stream) noexcept
 
 {
   build();
@@ -120,7 +120,7 @@ constexpr void dynamic_bitset<Allocator>::select(KeyIt keys_begin,
 
   auto const grid_size = cuco::detail::grid_size(num_keys);
 
-  bitset_select_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+  bitset_select_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
     ref(), keys_begin, outputs_begin, num_keys);
 }
 
@@ -129,7 +129,7 @@ constexpr void dynamic_bitset<Allocator>::build_ranks_and_selects(
   thrust::device_vector<rank_type, rank_allocator_type>& ranks,
   thrust::device_vector<size_type, size_allocator_type>& selects,
   bool flip_bits,
-  cuda_stream_ref stream)
+  cuda::stream_ref stream)
 {
   if (n_bits_ == 0) { return; }
 
@@ -143,15 +143,19 @@ constexpr void dynamic_bitset<Allocator>::build_ranks_and_selects(
   auto const bit_counts_begin = thrust::raw_pointer_cast(bit_counts.data());
 
   auto grid_size = cuco::detail::grid_size(num_words);
-  bit_counts_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+  bit_counts_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
     thrust::raw_pointer_cast(words_.data()), bit_counts_begin, num_words, flip_bits);
 
   std::size_t temp_storage_bytes = 0;
   using temp_allocator_type = typename std::allocator_traits<allocator_type>::rebind_alloc<char>;
   auto temp_allocator       = temp_allocator_type{this->allocator_};
 
-  CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum(
-    nullptr, temp_storage_bytes, bit_counts_begin, bit_counts_begin, bit_counts_size, stream));
+  CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum(nullptr,
+                                              temp_storage_bytes,
+                                              bit_counts_begin,
+                                              bit_counts_begin,
+                                              bit_counts_size,
+                                              stream.get()));
 
   // Allocate temporary storage
   auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes);
@@ -161,7 +165,7 @@ constexpr void dynamic_bitset<Allocator>::build_ranks_and_selects(
                                               bit_counts_begin,
                                               bit_counts_begin,
                                               bit_counts_size,
-                                              stream));
+                                              stream.get()));
 
   temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
 
@@ -170,25 +174,30 @@ constexpr void dynamic_bitset<Allocator>::build_ranks_and_selects(
   ranks.resize(num_blocks);
 
   grid_size = cuco::detail::grid_size(num_blocks);
-  encode_ranks_from_prefix_bit_counts<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
-    bit_counts_begin,
-    thrust::raw_pointer_cast(ranks.data()),
-    num_words,
-    num_blocks,
-    words_per_block);
+  encode_ranks_from_prefix_bit_counts<<<grid_size,
+                                        cuco::detail::default_block_size(),
+                                        0,
+                                        stream.get()>>>(bit_counts_begin,
+                                                        thrust::raw_pointer_cast(ranks.data()),
+                                                        num_words,
+                                                        num_blocks,
+                                                        words_per_block);
 
   // Step 3. Compute selects
   thrust::device_vector<size_type, size_allocator_type> select_markers(num_blocks,
                                                                        this->allocator_);
   auto const select_markers_begin = thrust::raw_pointer_cast(select_markers.data());
 
-  mark_blocks_with_select_entries<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+  mark_blocks_with_select_entries<<<grid_size,
+                                    cuco::detail::default_block_size(),
+                                    0,
+                                    stream.get()>>>(
     bit_counts_begin, select_markers_begin, num_blocks, words_per_block, bits_per_block);
 
   auto d_sum = reinterpret_cast<size_type*>(thrust::raw_pointer_cast(
     std::allocator_traits<temp_allocator_type>::allocate(temp_allocator, sizeof(size_type))));
   CUCO_CUDA_TRY(cub::DeviceReduce::Sum(
-    nullptr, temp_storage_bytes, select_markers_begin, d_sum, num_blocks, stream));
+    nullptr, temp_storage_bytes, select_markers_begin, d_sum, num_blocks, stream.get()));
 
   d_temp_storage = temp_allocator.allocate(temp_storage_bytes);
 
@@ -197,12 +206,12 @@ constexpr void dynamic_bitset<Allocator>::build_ranks_and_selects(
                                        select_markers_begin,
                                        d_sum,
                                        num_blocks,
-                                       stream));
+                                       stream.get()));
 
   size_type num_selects{};
   CUCO_CUDA_TRY(
-    cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
-  stream.synchronize();
+    cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream.get()));
+  stream.wait();
   std::allocator_traits<temp_allocator_type>::deallocate(
     temp_allocator, thrust::device_ptr<char>{reinterpret_cast<char*>(d_sum)}, sizeof(size_type));
   temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
@@ -218,7 +227,7 @@ constexpr void dynamic_bitset<Allocator>::build_ranks_and_selects(
                                            select_begin,
                                            thrust::make_discard_iterator(),
                                            num_blocks,
-                                           stream));
+                                           stream.get()));
 
   d_temp_storage = temp_allocator.allocate(temp_storage_bytes);
 
@@ -229,13 +238,13 @@ constexpr void dynamic_bitset<Allocator>::build_ranks_and_selects(
                                            select_begin,
                                            thrust::make_discard_iterator(),
                                            num_blocks,
-                                           stream));
+                                           stream.get()));
 
   temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
 }
 
 template <class Allocator>
-constexpr void dynamic_bitset<Allocator>::build(cuda_stream_ref stream) noexcept
+constexpr void dynamic_bitset<Allocator>::build(cuda::stream_ref stream) noexcept
 {
   if (not is_built_) {
     build_ranks_and_selects(ranks_true_, selects_true_, false, stream);   // 1 bits
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index e4dee193f..feb8f00a0 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -15,7 +15,6 @@
  */
 #pragma once
 
-#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/hyperloglog/hyperloglog.cuh>
 #include <cuco/distinct_count_estimator_ref.cuh>
 #include <cuco/hash_functions.cuh>
@@ -23,6 +22,8 @@
 #include <cuco/utility/allocator.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
+#include <cuda/stream_ref>
+
 #include <cstddef>
 #include <iterator>
 #include <memory>
@@ -71,7 +72,7 @@ class distinct_count_estimator {
   constexpr distinct_count_estimator(cuco::sketch_size_kb sketch_size_kb = 32_KB,
                                      Hash const& hash                    = {},
                                      Allocator const& alloc              = {},
-                                     cuco::cuda_stream_ref stream        = {});
+                                     cuda::stream_ref stream             = {});
 
   /**
    * @brief Constructs a `distinct_count_estimator` host object.
@@ -84,9 +85,9 @@ class distinct_count_estimator {
    * @param stream CUDA stream used to initialize the object
    */
   constexpr distinct_count_estimator(cuco::standard_deviation standard_deviation,
-                                     Hash const& hash             = {},
-                                     Allocator const& alloc       = {},
-                                     cuco::cuda_stream_ref stream = {});
+                                     Hash const& hash        = {},
+                                     Allocator const& alloc  = {},
+                                     cuda::stream_ref stream = {});
 
   ~distinct_count_estimator() = default;
 
@@ -106,7 +107,7 @@ class distinct_count_estimator {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  constexpr void clear_async(cuco::cuda_stream_ref stream = {}) noexcept;
+  constexpr void clear_async(cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief Resets the estimator, i.e., clears the current count estimate.
@@ -116,7 +117,7 @@ class distinct_count_estimator {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  constexpr void clear(cuco::cuda_stream_ref stream = {});
+  constexpr void clear(cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously adds to be counted items to the estimator.
@@ -130,7 +131,7 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
+  constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream = {});
 
   /**
    * @brief Adds to be counted items to the estimator.
@@ -147,7 +148,7 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
+  constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously merges the result of `other` estimator into `*this` estimator.
@@ -163,7 +164,7 @@ class distinct_count_estimator {
   template <cuda::thread_scope OtherScope, class OtherAllocator>
   constexpr void merge_async(
     distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
-    cuco::cuda_stream_ref stream = {});
+    cuda::stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator into `*this` estimator.
@@ -181,7 +182,7 @@ class distinct_count_estimator {
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
   constexpr void merge(distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
-                       cuco::cuda_stream_ref stream = {});
+                       cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
@@ -194,8 +195,7 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  constexpr void merge_async(ref_type<OtherScope> const& other_ref,
-                             cuco::cuda_stream_ref stream = {});
+  constexpr void merge_async(ref_type<OtherScope> const& other_ref, cuda::stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator.
@@ -211,7 +211,7 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  constexpr void merge(ref_type<OtherScope> const& other_ref, cuco::cuda_stream_ref stream = {});
+  constexpr void merge(ref_type<OtherScope> const& other_ref, cuda::stream_ref stream = {});
 
   /**
    * @brief Compute the estimated distinct items count.
@@ -222,7 +222,7 @@ class distinct_count_estimator {
    *
    * @return Approximate distinct items count
    */
-  [[nodiscard]] constexpr std::size_t estimate(cuco::cuda_stream_ref stream = {}) const;
+  [[nodiscard]] constexpr std::size_t estimate(cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Get device ref.
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index 44374c6b5..cb566990e 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -15,12 +15,13 @@
  */
 #pragma once
 
-#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
 #include <cuco/hash_functions.cuh>
 #include <cuco/types.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
+#include <cuda/stream_ref>
+
 #include <cooperative_groups.h>
 
 #include <cstddef>
@@ -81,7 +82,7 @@ class distinct_count_estimator_ref {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  __host__ constexpr void clear_async(cuco::cuda_stream_ref stream = {}) noexcept;
+  __host__ constexpr void clear_async(cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief Resets the estimator, i.e., clears the current count estimate.
@@ -91,7 +92,7 @@ class distinct_count_estimator_ref {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  __host__ constexpr void clear(cuco::cuda_stream_ref stream = {});
+  __host__ constexpr void clear(cuda::stream_ref stream = {});
 
   /**
    * @brief Adds an item to the estimator.
@@ -112,7 +113,7 @@ class distinct_count_estimator_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  __host__ constexpr void add_async(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
+  __host__ constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream = {});
 
   /**
    * @brief Adds to be counted items to the estimator.
@@ -129,7 +130,7 @@ class distinct_count_estimator_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <class InputIt>
-  __host__ constexpr void add(InputIt first, InputIt last, cuco::cuda_stream_ref stream = {});
+  __host__ constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator reference.
@@ -158,8 +159,7 @@ class distinct_count_estimator_ref {
    */
   template <cuda::thread_scope OtherScope>
   __host__ constexpr void merge_async(
-    distinct_count_estimator_ref<T, OtherScope, Hash> const& other,
-    cuco::cuda_stream_ref stream = {});
+    distinct_count_estimator_ref<T, OtherScope, Hash> const& other, cuda::stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator.
@@ -176,7 +176,7 @@ class distinct_count_estimator_ref {
    */
   template <cuda::thread_scope OtherScope>
   __host__ constexpr void merge(distinct_count_estimator_ref<T, OtherScope, Hash> const& other,
-                                cuco::cuda_stream_ref stream = {});
+                                cuda::stream_ref stream = {});
 
   /**
    * @brief Compute the estimated distinct items count.
@@ -197,7 +197,7 @@ class distinct_count_estimator_ref {
    *
    * @return Approximate distinct items count
    */
-  [[nodiscard]] __host__ constexpr std::size_t estimate(cuco::cuda_stream_ref stream = {}) const;
+  [[nodiscard]] __host__ constexpr std::size_t estimate(cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Gets the hash function.
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index 95da423f0..40556ed06 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/__config>
 #include <cuco/detail/open_addressing/open_addressing_impl.cuh>
 #include <cuco/detail/static_map_kernels.cuh>
@@ -29,6 +28,7 @@
 #include <cuco/utility/traits.hpp>
 
 #include <cuda/std/atomic>
+#include <cuda/stream_ref>
 #include <thrust/functional.h>
 
 #if defined(CUCO_HAS_CUDA_BARRIER)
@@ -181,7 +181,7 @@ class static_map {
                        cuda_thread_scope<Scope> scope      = {},
                        Storage storage                     = {},
                        Allocator const& alloc              = {},
-                       cuda_stream_ref stream              = {});
+                       cuda::stream_ref stream             = {});
 
   /**
    * @brief Constructs a statically-sized map with the number of elements to insert `n`, the desired
@@ -225,7 +225,7 @@ class static_map {
                        cuda_thread_scope<Scope> scope      = {},
                        Storage storage                     = {},
                        Allocator const& alloc              = {},
-                       cuda_stream_ref stream              = {});
+                       cuda::stream_ref stream             = {});
 
   /**
    * @brief Constructs a statically-sized map with the specified initial capacity, sentinel values
@@ -261,7 +261,7 @@ class static_map {
                        cuda_thread_scope<Scope> scope      = {},
                        Storage storage                     = {},
                        Allocator const& alloc              = {},
-                       cuda_stream_ref stream              = {});
+                       cuda::stream_ref stream             = {});
 
   /**
    * @brief Erases all elements from the container. After this call, `size()` returns zero.
@@ -269,7 +269,7 @@ class static_map {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear(cuda_stream_ref stream = {});
+  void clear(cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
@@ -277,7 +277,7 @@ class static_map {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear_async(cuda_stream_ref stream = {}) noexcept;
+  void clear_async(cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief Inserts all keys in the range `[first, last)` and returns the number of successful
@@ -297,7 +297,7 @@ class static_map {
    * @return Number of successful insertions
    */
   template <typename InputIt>
-  size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {});
+  size_type insert(InputIt first, InputIt last, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously inserts all keys in the range `[first, last)`.
@@ -311,7 +311,7 @@ class static_map {
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt>
-  void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept;
+  void insert_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns
@@ -339,7 +339,7 @@ class static_map {
    */
   template <typename InputIt, typename StencilIt, typename Predicate>
   size_type insert_if(
-    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {});
+    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding
@@ -366,7 +366,7 @@ class static_map {
                        InputIt last,
                        StencilIt stencil,
                        Predicate pred,
-                       cuda_stream_ref stream = {}) noexcept;
+                       cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k`
@@ -387,7 +387,7 @@ class static_map {
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt>
-  void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {});
+  void insert_or_assign(InputIt first, InputIt last, cuda::stream_ref stream = {});
 
   /**
    * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k`
@@ -406,7 +406,7 @@ class static_map {
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt>
-  void insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept;
+  void insert_or_assign_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief Erases keys in the range `[first, last)`.
@@ -433,7 +433,7 @@ class static_map {
    * provided at construction
    */
   template <typename InputIt>
-  void erase(InputIt first, InputIt last, cuda_stream_ref stream = {});
+  void erase(InputIt first, InputIt last, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously erases keys in the range `[first, last)`.
@@ -458,7 +458,7 @@ class static_map {
    * provided at construction
    */
   template <typename InputIt>
-  void erase_async(InputIt first, InputIt last, cuda_stream_ref stream = {});
+  void erase_async(InputIt first, InputIt last, cuda::stream_ref stream = {});
 
   /**
    * @brief Indicates whether the keys in the range `[first, last)` are contained in the map.
@@ -478,7 +478,7 @@ class static_map {
   void contains(InputIt first,
                 InputIt last,
                 OutputIt output_begin,
-                cuda_stream_ref stream = {}) const;
+                cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in
@@ -496,7 +496,7 @@ class static_map {
   void contains_async(InputIt first,
                       InputIt last,
                       OutputIt output_begin,
-                      cuda_stream_ref stream = {}) const noexcept;
+                      cuda::stream_ref stream = {}) const noexcept;
 
   /**
    * @brief Indicates whether the keys in the range `[first, last)` are contained in the map if
@@ -529,7 +529,7 @@ class static_map {
                    StencilIt stencil,
                    Predicate pred,
                    OutputIt output_begin,
-                   cuda_stream_ref stream = {}) const;
+                   cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in
@@ -560,7 +560,7 @@ class static_map {
                          StencilIt stencil,
                          Predicate pred,
                          OutputIt output_begin,
-                         cuda_stream_ref stream = {}) const noexcept;
+                         cuda::stream_ref stream = {}) const noexcept;
 
   /**
    * @brief For all keys in the range `[first, last)`, finds a payload with its key equivalent to
@@ -579,7 +579,7 @@ class static_map {
    * @param stream Stream used for executing the kernels
    */
   template <typename InputIt, typename OutputIt>
-  void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const;
+  void find(InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream = {}) const;
 
   /**
    * @brief For all keys in the range `[first, last)`, asynchronously finds a payload with its key
@@ -600,7 +600,7 @@ class static_map {
   void find_async(InputIt first,
                   InputIt last,
                   OutputIt output_begin,
-                  cuda_stream_ref stream = {}) const;
+                  cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Retrieves all of the keys and their associated values.
@@ -625,7 +625,7 @@ class static_map {
   template <typename KeyOut, typename ValueOut>
   std::pair<KeyOut, ValueOut> retrieve_all(KeyOut keys_out,
                                            ValueOut values_out,
-                                           cuda_stream_ref stream = {}) const;
+                                           cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Regenerates the container.
@@ -635,7 +635,7 @@ class static_map {
    *
    * @param stream CUDA stream used for this operation
    */
-  void rehash(cuda_stream_ref stream = {});
+  void rehash(cuda::stream_ref stream = {});
 
   /**
    * @brief Reserves at least the specified number of slots and regenerates the container
@@ -655,14 +655,14 @@ class static_map {
    * @param capacity New capacity of the container
    * @param stream CUDA stream used for this operation
    */
-  void rehash(size_type capacity, cuda_stream_ref stream = {});
+  void rehash(size_type capacity, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously regenerates the container.
    *
    * @param stream CUDA stream used for this operation
    */
-  void rehash_async(cuda_stream_ref stream = {});
+  void rehash_async(cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously reserves at least the specified number of slots and regenerates the
@@ -680,7 +680,7 @@ class static_map {
    * @param capacity New capacity of the container
    * @param stream CUDA stream used for this operation
    */
-  void rehash_async(size_type capacity, cuda_stream_ref stream = {});
+  void rehash_async(size_type capacity, cuda::stream_ref stream = {});
 
   /**
    * @brief Gets the number of elements in the container.
@@ -690,7 +690,7 @@ class static_map {
    * @param stream CUDA stream used to get the number of inserted elements
    * @return The number of elements in the container
    */
-  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const;
+  [[nodiscard]] size_type size(cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Gets the maximum number of elements the hash map can hold.
diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh
index 81e5cda74..e23d1b02b 100644
--- a/include/cuco/static_multiset.cuh
+++ b/include/cuco/static_multiset.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/open_addressing/open_addressing_impl.cuh>
 #include <cuco/extent.cuh>
 #include <cuco/hash_functions.cuh>
@@ -29,6 +28,7 @@
 #include <cuco/utility/traits.hpp>
 
 #include <cuda/atomic>
+#include <cuda/stream_ref>
 #include <thrust/functional.h>
 
 #include <cstddef>
@@ -151,7 +151,7 @@ class static_multiset {
                             cuda_thread_scope<Scope> scope      = {},
                             Storage storage                     = {},
                             Allocator const& alloc              = {},
-                            cuda_stream_ref stream              = {});
+                            cuda::stream_ref stream             = {});
 
   /**
    * @brief Constructs a statically-sized multiset with the number of elements to insert `n`, the
@@ -193,7 +193,7 @@ class static_multiset {
                             cuda_thread_scope<Scope> scope      = {},
                             Storage storage                     = {},
                             Allocator const& alloc              = {},
-                            cuda_stream_ref stream              = {});
+                            cuda::stream_ref stream             = {});
 
   /**
    * @brief Constructs a statically-sized set with the specified initial capacity, sentinel values
@@ -227,7 +227,7 @@ class static_multiset {
                             cuda_thread_scope<Scope> scope      = {},
                             Storage storage                     = {},
                             Allocator const& alloc              = {},
-                            cuda_stream_ref stream              = {});
+                            cuda::stream_ref stream             = {});
 
   /**
    * @brief Erases all elements from the container. After this call, `size()` returns zero.
@@ -235,7 +235,7 @@ class static_multiset {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear(cuda_stream_ref stream = {});
+  void clear(cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
@@ -243,7 +243,7 @@ class static_multiset {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear_async(cuda_stream_ref stream = {}) noexcept;
+  void clear_async(cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief Inserts all keys in the range `[first, last)`
@@ -261,7 +261,7 @@ class static_multiset {
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt>
-  void insert(InputIt first, InputIt last, cuda_stream_ref stream = {});
+  void insert(InputIt first, InputIt last, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously inserts all keys in the range `[first, last)`.
@@ -276,7 +276,7 @@ class static_multiset {
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt>
-  void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept;
+  void insert_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns
@@ -302,7 +302,7 @@ class static_multiset {
    */
   template <typename InputIt, typename StencilIt, typename Predicate>
   void insert_if(
-    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {});
+    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding
@@ -329,7 +329,7 @@ class static_multiset {
                        InputIt last,
                        StencilIt stencil,
                        Predicate pred,
-                       cuda_stream_ref stream = {}) noexcept;
+                       cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief Indicates whether the keys in the range `[first, last)` are contained in the multiset.
@@ -349,7 +349,7 @@ class static_multiset {
   void contains(InputIt first,
                 InputIt last,
                 OutputIt output_begin,
-                cuda_stream_ref stream = {}) const;
+                cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in
@@ -367,7 +367,7 @@ class static_multiset {
   void contains_async(InputIt first,
                       InputIt last,
                       OutputIt output_begin,
-                      cuda_stream_ref stream = {}) const noexcept;
+                      cuda::stream_ref stream = {}) const noexcept;
 
   /**
    * @brief Indicates whether the keys in the range `[first, last)` are contained in the multiset if
@@ -400,7 +400,7 @@ class static_multiset {
                    StencilIt stencil,
                    Predicate pred,
                    OutputIt output_begin,
-                   cuda_stream_ref stream = {}) const;
+                   cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in
@@ -431,7 +431,7 @@ class static_multiset {
                          StencilIt stencil,
                          Predicate pred,
                          OutputIt output_begin,
-                         cuda_stream_ref stream = {}) const noexcept;
+                         cuda::stream_ref stream = {}) const noexcept;
 
   /**
    * @brief For all keys in the range `[first, last)`, finds an element with key equivalent to the
@@ -450,7 +450,7 @@ class static_multiset {
    * @param stream Stream used for executing the kernels
    */
   template <typename InputIt, typename OutputIt>
-  void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const;
+  void find(InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream = {}) const;
 
   /**
    * @brief For all keys in the range `[first, last)`, asynchronously finds an element with key
@@ -471,7 +471,7 @@ class static_multiset {
   void find_async(InputIt first,
                   InputIt last,
                   OutputIt output_begin,
-                  cuda_stream_ref stream = {}) const;
+                  cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset
@@ -487,7 +487,7 @@ class static_multiset {
    * @return The sum of total occurrences of all keys in `[first, last)`
    */
   template <typename InputIt>
-  size_type count(InputIt first, InputIt last, cuda_stream_ref stream = {}) const noexcept;
+  size_type count(InputIt first, InputIt last, cuda::stream_ref stream = {}) const noexcept;
 
   /**
    * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset
@@ -511,7 +511,7 @@ class static_multiset {
                   InputIt last,
                   ProbeKeyEqual const& probe_key_equal,
                   ProbeHash const& probe_hash,
-                  cuda_stream_ref stream = {}) const noexcept;
+                  cuda::stream_ref stream = {}) const noexcept;
 
   /**
    * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset
@@ -537,7 +537,7 @@ class static_multiset {
                         InputIt last,
                         ProbeKeyEqual const& probe_key_equal,
                         ProbeHash const& probe_hash,
-                        cuda_stream_ref stream = {}) const noexcept;
+                        cuda::stream_ref stream = {}) const noexcept;
 
   /**
    * @brief Gets the number of elements in the container.
@@ -547,7 +547,7 @@ class static_multiset {
    * @param stream CUDA stream used to get the number of inserted elements
    * @return The number of elements in the container
    */
-  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const;
+  [[nodiscard]] size_type size(cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Gets the maximum number of elements the multiset can hold.
diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh
index 3517f8429..a83649409 100644
--- a/include/cuco/static_set.cuh
+++ b/include/cuco/static_set.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <cuco/cuda_stream_ref.hpp>
 #include <cuco/detail/open_addressing/open_addressing_impl.cuh>
 #include <cuco/extent.cuh>
 #include <cuco/hash_functions.cuh>
@@ -30,6 +29,7 @@
 
 #include <cuda/atomic>
 #include <cuda/std/utility>
+#include <cuda/stream_ref>
 #include <thrust/functional.h>
 
 #if defined(CUCO_HAS_CUDA_BARRIER)
@@ -156,7 +156,7 @@ class static_set {
                        cuda_thread_scope<Scope> scope      = {},
                        Storage storage                     = {},
                        Allocator const& alloc              = {},
-                       cuda_stream_ref stream              = {});
+                       cuda::stream_ref stream             = {});
 
   /**
    * @brief Constructs a statically-sized set with the number of elements to insert `n`, the desired
@@ -198,7 +198,7 @@ class static_set {
                        cuda_thread_scope<Scope> scope      = {},
                        Storage storage                     = {},
                        Allocator const& alloc              = {},
-                       cuda_stream_ref stream              = {});
+                       cuda::stream_ref stream             = {});
 
   /**
    * @brief Constructs a statically-sized set with the specified initial capacity, sentinel values
@@ -232,7 +232,7 @@ class static_set {
                        cuda_thread_scope<Scope> scope      = {},
                        Storage storage                     = {},
                        Allocator const& alloc              = {},
-                       cuda_stream_ref stream              = {});
+                       cuda::stream_ref stream             = {});
 
   /**
    * @brief Erases all elements from the container. After this call, `size()` returns zero.
@@ -240,7 +240,7 @@ class static_set {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear(cuda_stream_ref stream = {});
+  void clear(cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
@@ -248,7 +248,7 @@ class static_set {
    *
    * @param stream CUDA stream this operation is executed in
    */
-  void clear_async(cuda_stream_ref stream = {}) noexcept;
+  void clear_async(cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief Inserts all keys in the range `[first, last)` and returns the number of successful
@@ -268,7 +268,7 @@ class static_set {
    * @return Number of successfully inserted keys
    */
   template <typename InputIt>
-  size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {});
+  size_type insert(InputIt first, InputIt last, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously inserts all keys in the range `[first, last)`.
@@ -282,7 +282,7 @@ class static_set {
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt>
-  void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept;
+  void insert_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns
@@ -310,7 +310,7 @@ class static_set {
    */
   template <typename InputIt, typename StencilIt, typename Predicate>
   size_type insert_if(
-    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {});
+    InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding
@@ -337,7 +337,7 @@ class static_set {
                        InputIt last,
                        StencilIt stencil,
                        Predicate pred,
-                       cuda_stream_ref stream = {}) noexcept;
+                       cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief Erases keys in the range `[first, last)`.
@@ -364,7 +364,7 @@ class static_set {
    * provided at construction
    */
   template <typename InputIt>
-  void erase(InputIt first, InputIt last, cuda_stream_ref stream = {});
+  void erase(InputIt first, InputIt last, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously erases keys in the range `[first, last)`.
@@ -389,7 +389,7 @@ class static_set {
    * provided at construction
    */
   template <typename InputIt>
-  void erase_async(InputIt first, InputIt last, cuda_stream_ref stream = {});
+  void erase_async(InputIt first, InputIt last, cuda::stream_ref stream = {});
 
   /**
    * @brief Indicates whether the keys in the range `[first, last)` are contained in the set.
@@ -409,7 +409,7 @@ class static_set {
   void contains(InputIt first,
                 InputIt last,
                 OutputIt output_begin,
-                cuda_stream_ref stream = {}) const;
+                cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in
@@ -427,7 +427,7 @@ class static_set {
   void contains_async(InputIt first,
                       InputIt last,
                       OutputIt output_begin,
-                      cuda_stream_ref stream = {}) const noexcept;
+                      cuda::stream_ref stream = {}) const noexcept;
 
   /**
    * @brief Indicates whether the keys in the range `[first, last)` are contained in the set if
@@ -460,7 +460,7 @@ class static_set {
                    StencilIt stencil,
                    Predicate pred,
                    OutputIt output_begin,
-                   cuda_stream_ref stream = {}) const;
+                   cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in
@@ -491,7 +491,7 @@ class static_set {
                          StencilIt stencil,
                          Predicate pred,
                          OutputIt output_begin,
-                         cuda_stream_ref stream = {}) const noexcept;
+                         cuda::stream_ref stream = {}) const noexcept;
 
   /**
    * @brief For all keys in the range `[first, last)`, finds an element with key equivalent to the
@@ -510,7 +510,7 @@ class static_set {
    * @param stream Stream used for executing the kernels
    */
   template <typename InputIt, typename OutputIt>
-  void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const;
+  void find(InputIt first, InputIt last, OutputIt output_begin, cuda::stream_ref stream = {}) const;
 
   /**
    * @brief For all keys in the range `[first, last)`, asynchronously finds an element with key
@@ -531,7 +531,7 @@ class static_set {
   void find_async(InputIt first,
                   InputIt last,
                   OutputIt output_begin,
-                  cuda_stream_ref stream = {}) const;
+                  cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Retrieves the matched key in the set corresponding to all probe keys in the range
@@ -564,7 +564,7 @@ class static_set {
                                                  InputIt last,
                                                  OutputIt1 output_probe,
                                                  OutputIt2 output_match,
-                                                 cuda_stream_ref stream = {}) const;
+                                                 cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Asynchronously retrieves the matched key in the set corresponding to all probe keys in
@@ -602,7 +602,7 @@ class static_set {
                     OutputIt output_begin,
                     ProbeEqual const& probe_equal = ProbeEqual{},
                     ProbeHash const& probe_hash   = ProbeHash{},
-                    cuda_stream_ref stream        = {}) const;
+                    cuda::stream_ref stream       = {}) const;
 
   /**
    * @brief Retrieves all keys contained in the set.
@@ -622,7 +622,7 @@ class static_set {
    * @return Iterator indicating the end of the output
    */
   template <typename OutputIt>
-  OutputIt retrieve_all(OutputIt output_begin, cuda_stream_ref stream = {}) const;
+  OutputIt retrieve_all(OutputIt output_begin, cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Regenerates the container.
@@ -632,7 +632,7 @@ class static_set {
    *
    * @param stream CUDA stream used for this operation
    */
-  void rehash(cuda_stream_ref stream = {});
+  void rehash(cuda::stream_ref stream = {});
 
   /**
    * @brief Reserves at least the specified number of slots and regenerates the container
@@ -652,14 +652,14 @@ class static_set {
    * @param capacity New capacity of the container
    * @param stream CUDA stream used for this operation
    */
-  void rehash(size_type capacity, cuda_stream_ref stream = {});
+  void rehash(size_type capacity, cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously regenerates the container.
    *
    * @param stream CUDA stream used for this operation
    */
-  void rehash_async(cuda_stream_ref stream = {});
+  void rehash_async(cuda::stream_ref stream = {});
 
   /**
    * @brief Asynchronously reserves at least the specified number of slots and regenerates the
@@ -677,7 +677,7 @@ class static_set {
    * @param capacity New capacity of the container
    * @param stream CUDA stream used for this operation
    */
-  void rehash_async(size_type capacity, cuda_stream_ref stream = {});
+  void rehash_async(size_type capacity, cuda::stream_ref stream = {});
 
   /**
    * @brief Gets the number of elements in the container.
@@ -687,7 +687,7 @@ class static_set {
    * @param stream CUDA stream used to get the number of inserted elements
    * @return The number of elements in the container
    */
-  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const;
+  [[nodiscard]] size_type size(cuda::stream_ref stream = {}) const;
 
   /**
    * @brief Gets the maximum number of elements the hash set can hold.