From 8cc23bdd4f894c85d5ee400712db994711244b3d Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 25 Nov 2020 04:34:25 +1100 Subject: [PATCH] Replace raw streams with rmm::cuda_stream_view (part 3) (#6744) Converting libcudf to use `rmm::cuda_stream_view` will require a LOT of changes, so I'm splitting it into multiple PRs to ease reviewing. This is the third PR in the series. This series of PRs will - Replace usage of `cudaStream_t` with `rmm::cuda_stream_view` - Replace usage of `0` or `nullptr` as a stream identifier with `rmm::cuda_stream_default` - Ensure all APIs always order the stream parameter before the memory resource parameter. #5119 Contributes to #6645 and #5119 Depends on #6646 and #6648 so this PR will look much bigger until they are merged. This third PR converts: - remaining dictionary functionality - cuio - lists - scalar - strings - groupby - join - contiguous_split - get_element - datetime_ops - extract - merge - partitioning - minmax reduction - scan - byte_cast - clamp - interleave_columns - is_sorted - groupby - rank - tests - concurrent map classes --- CHANGELOG.md | 1 + .../synchronization/synchronization.cpp | 13 +- .../synchronization/synchronization.hpp | 15 +- cpp/include/cudf/detail/copy.hpp | 2 +- cpp/include/cudf/detail/copy_range.cuh | 2 +- cpp/include/cudf/detail/scatter.cuh | 5 +- cpp/include/cudf/dictionary/detail/encode.hpp | 14 +- cpp/include/cudf/dictionary/detail/merge.hpp | 8 +- .../cudf/dictionary/detail/update_keys.hpp | 26 +-- .../cudf/dictionary/dictionary_factories.hpp | 14 +- cpp/include/cudf/groupby.hpp | 8 +- cpp/include/cudf/io/avro.hpp | 4 +- cpp/include/cudf/io/csv.hpp | 1 - cpp/include/cudf/io/data_sink.hpp | 10 +- cpp/include/cudf/io/datasource.hpp | 6 +- cpp/include/cudf/io/detail/avro.hpp | 5 +- cpp/include/cudf/io/detail/csv.hpp | 6 +- cpp/include/cudf/io/detail/json.hpp | 5 +- cpp/include/cudf/io/detail/orc.hpp | 7 +- cpp/include/cudf/io/detail/parquet.hpp | 18 +- cpp/include/cudf/io/json.hpp | 4 +- 
cpp/include/cudf/io/types.hpp | 1 + cpp/include/cudf/join.hpp | 16 +- cpp/include/cudf/lists/detail/copying.hpp | 4 +- cpp/include/cudf/lists/detail/gather.cuh | 2 +- cpp/include/cudf/lists/detail/scatter.cuh | 22 +- cpp/include/cudf/scalar/scalar_factories.hpp | 18 +- cpp/include/cudf/strings/copying.hpp | 4 +- cpp/include/cudf/strings/detail/combine.hpp | 6 +- .../cudf/strings/detail/converters.hpp | 22 +- .../cudf/strings/detail/copy_if_else.cuh | 11 +- .../cudf/strings/detail/copy_range.cuh | 10 +- cpp/include/cudf/strings/detail/gather.cuh | 8 +- cpp/include/cudf/strings/detail/merge.cuh | 16 +- .../cudf/strings/detail/modify_strings.cuh | 18 +- cpp/include/cudf/strings/detail/scatter.cuh | 6 +- .../detail/strings_column_factories.cuh | 58 +++--- cpp/include/cudf/strings/detail/utilities.cuh | 20 +- cpp/include/cudf/strings/detail/utilities.hpp | 35 ++-- .../cudf/strings/strings_column_view.hpp | 5 +- cpp/include/cudf/table/table_device_view.cuh | 2 +- cpp/include/cudf/types.hpp | 5 - cpp/include/cudf/utilities/error.hpp | 6 +- cpp/include/cudf/utilities/traits.hpp | 2 +- cpp/include/nvtext/detail/load_hash_file.hpp | 7 +- cpp/include/nvtext/detail/tokenize.hpp | 18 +- cpp/src/bitmask/null_mask.cu | 6 +- cpp/src/column/column.cu | 13 +- cpp/src/copying/contiguous_split.cu | 1 + cpp/src/copying/copy_range.cu | 5 +- cpp/src/copying/get_element.cu | 21 +- cpp/src/copying/scatter.cu | 4 +- cpp/src/datetime/datetime_ops.cu | 29 ++- cpp/src/dictionary/add_keys.cu | 15 +- cpp/src/dictionary/decode.cu | 11 +- cpp/src/dictionary/detail/merge.cu | 8 +- cpp/src/dictionary/dictionary_factories.cu | 21 +- cpp/src/dictionary/encode.cu | 16 +- cpp/src/dictionary/remove_keys.cu | 30 +-- cpp/src/dictionary/replace.cu | 26 +-- cpp/src/dictionary/set_keys.cu | 32 +-- cpp/src/filling/fill.cu | 2 +- cpp/src/filling/repeat.cu | 6 +- cpp/src/groupby/groupby.cu | 4 +- cpp/src/groupby/hash/groupby.cu | 10 +- cpp/src/groupby/sort/group_argmax.cu | 13 +- 
cpp/src/groupby/sort/group_argmin.cu | 13 +- cpp/src/groupby/sort/group_collect.cu | 6 +- cpp/src/groupby/sort/group_count.cu | 18 +- cpp/src/groupby/sort/group_max.cu | 12 +- cpp/src/groupby/sort/group_min.cu | 12 +- cpp/src/groupby/sort/group_nth_element.cu | 14 +- cpp/src/groupby/sort/group_nunique.cu | 22 +- cpp/src/groupby/sort/group_quantiles.cu | 20 +- cpp/src/groupby/sort/group_reductions.hpp | 49 ++--- .../sort/group_single_pass_reduction_util.cuh | 14 +- cpp/src/groupby/sort/group_std.cu | 25 +-- cpp/src/groupby/sort/group_sum.cu | 12 +- cpp/src/groupby/sort/groupby.cu | 52 ++--- cpp/src/hash/concurrent_unordered_map.cuh | 37 ++-- .../hash/concurrent_unordered_multimap.cuh | 39 ++-- cpp/src/hash/hash_allocator.cuh | 15 +- cpp/src/hash/hashing.cu | 8 +- cpp/src/hash/unordered_multiset.cuh | 17 +- cpp/src/interop/dlpack.cpp | 2 +- cpp/src/interop/from_arrow.cpp | 6 +- cpp/src/io/avro/avro_gpu.cu | 42 ++-- cpp/src/io/avro/avro_gpu.h | 10 +- cpp/src/io/avro/reader_impl.cu | 39 ++-- cpp/src/io/avro/reader_impl.hpp | 8 +- cpp/src/io/comp/debrotli.cu | 20 +- cpp/src/io/comp/gpuinflate.cu | 15 +- cpp/src/io/comp/gpuinflate.h | 26 +-- cpp/src/io/comp/snap.cu | 13 +- cpp/src/io/comp/uncomp.cpp | 11 +- cpp/src/io/comp/unsnap.cu | 12 +- cpp/src/io/csv/csv_gpu.cu | 22 +- cpp/src/io/csv/csv_gpu.h | 13 +- cpp/src/io/csv/durations.cu | 23 ++- cpp/src/io/csv/reader_impl.cu | 35 ++-- cpp/src/io/csv/reader_impl.hpp | 10 +- cpp/src/io/csv/writer_impl.cu | 60 +++--- cpp/src/io/csv/writer_impl.hpp | 12 +- cpp/src/io/json/json_gpu.cu | 15 +- cpp/src/io/json/json_gpu.h | 8 +- cpp/src/io/json/reader_impl.cu | 61 +++--- cpp/src/io/json/reader_impl.hpp | 27 +-- cpp/src/io/orc/chunked_state.hpp | 4 +- cpp/src/io/orc/dict_enc.cu | 20 +- cpp/src/io/orc/orc_gpu.h | 39 ++-- cpp/src/io/orc/reader_impl.cu | 50 +++-- cpp/src/io/orc/reader_impl.hpp | 10 +- cpp/src/io/orc/stats_enc.cu | 19 +- cpp/src/io/orc/stripe_data.cu | 32 +-- cpp/src/io/orc/stripe_enc.cu | 28 ++- 
cpp/src/io/orc/stripe_init.cu | 19 +- cpp/src/io/orc/writer_impl.cu | 109 ++++++---- cpp/src/io/orc/writer_impl.hpp | 21 +- cpp/src/io/parquet/chunked_state.hpp | 19 +- cpp/src/io/parquet/page_data.cu | 60 +++--- cpp/src/io/parquet/page_dict.cu | 15 +- cpp/src/io/parquet/page_enc.cu | 59 +++--- cpp/src/io/parquet/page_hdr.cu | 12 +- cpp/src/io/parquet/parquet_gpu.hpp | 41 ++-- cpp/src/io/parquet/reader_impl.cu | 47 +++-- cpp/src/io/parquet/reader_impl.hpp | 19 +- cpp/src/io/parquet/writer_impl.cu | 88 ++++---- cpp/src/io/parquet/writer_impl.hpp | 18 +- cpp/src/io/statistics/column_stats.cu | 23 ++- cpp/src/io/statistics/column_stats.h | 6 +- cpp/src/io/utilities/column_buffer.hpp | 18 +- cpp/src/io/utilities/data_sink.cpp | 6 +- cpp/src/io/utilities/hostdevice_vector.hpp | 36 ++-- cpp/src/join/cross_join.cu | 6 +- cpp/src/join/hash_join.cu | 81 ++++---- cpp/src/join/hash_join.cuh | 53 ++--- cpp/src/join/join.cu | 73 +++---- cpp/src/join/nested_loop_join.cuh | 46 +++-- cpp/src/join/semi_join.cu | 18 +- cpp/src/lists/copying/copying.cu | 7 +- cpp/src/lists/copying/gather.cu | 5 +- cpp/src/lists/extract.cu | 10 +- cpp/src/merge/merge.cu | 116 ++++++----- cpp/src/partitioning/partitioning.cu | 104 +++++----- cpp/src/partitioning/round_robin.cu | 31 +-- cpp/src/reductions/minmax.cu | 25 ++- cpp/src/reductions/scan.cu | 116 +++++------ cpp/src/replace/clamp.cu | 78 ++++---- cpp/src/replace/nulls.cu | 8 +- cpp/src/replace/replace.cu | 17 +- cpp/src/reshape/byte_cast.cu | 31 ++- cpp/src/reshape/interleave_columns.cu | 31 +-- cpp/src/rolling/rolling.cu | 188 +++++++++--------- cpp/src/scalar/scalar_factories.cpp | 18 +- cpp/src/search/search.cu | 11 +- cpp/src/sort/is_sorted.cu | 14 +- cpp/src/sort/rank.cu | 43 ++-- cpp/src/strings/attributes.cu | 40 ++-- cpp/src/strings/capitalize.cu | 12 +- cpp/src/strings/case.cu | 43 ++-- cpp/src/strings/char_types/char_types.cu | 84 ++++---- cpp/src/strings/combine.cu | 58 +++--- cpp/src/strings/contains.cu | 64 +++--- 
cpp/src/strings/convert/convert_booleans.cu | 36 ++-- cpp/src/strings/convert/convert_datetime.cu | 67 +++---- cpp/src/strings/convert/convert_durations.cu | 54 +++-- cpp/src/strings/convert/convert_floats.cu | 48 +++-- cpp/src/strings/convert/convert_hex.cu | 46 ++--- cpp/src/strings/convert/convert_integers.cu | 54 +++-- cpp/src/strings/convert/convert_ipv4.cu | 59 +++--- cpp/src/strings/convert/convert_urls.cu | 41 ++-- cpp/src/strings/copying/concatenate.cu | 6 +- cpp/src/strings/copying/copying.cu | 8 +- cpp/src/strings/extract.cu | 21 +- cpp/src/strings/filling/fill.cu | 10 +- cpp/src/strings/filter_chars.cu | 15 +- cpp/src/strings/find.cu | 82 ++++---- cpp/src/strings/find_multiple.cu | 10 +- cpp/src/strings/padding.cu | 43 ++-- cpp/src/strings/regex/regex.cuh | 8 +- cpp/src/strings/regex/regexec.cu | 12 +- cpp/src/strings/replace/backref_re.cu | 31 +-- cpp/src/strings/replace/backref_re.cuh | 10 +- cpp/src/strings/replace/backref_re_large.cu | 12 +- cpp/src/strings/replace/backref_re_medium.cu | 12 +- cpp/src/strings/replace/multi_re.cu | 32 +-- cpp/src/strings/replace/replace.cu | 31 ++- cpp/src/strings/replace/replace_re.cu | 33 +-- cpp/src/strings/split/partition.cu | 18 +- cpp/src/strings/split/split.cu | 66 +++--- cpp/src/strings/split/split_record.cu | 39 ++-- cpp/src/strings/strings_column_factories.cu | 69 ++++--- cpp/src/strings/strings_column_view.cu | 21 +- cpp/src/strings/strings_scalar_factories.cpp | 6 +- cpp/src/strings/strip.cu | 19 +- cpp/src/strings/substring.cu | 64 +++--- cpp/src/strings/translate.cu | 23 ++- cpp/src/strings/utilities.cu | 40 ++-- cpp/src/strings/utilities.cuh | 16 +- cpp/src/strings/utilities.hpp | 4 +- cpp/src/strings/wrap.cu | 15 +- cpp/src/text/detokenize.cu | 27 ++- cpp/src/text/edit_distance.cu | 36 ++-- cpp/src/text/generate_ngrams.cu | 39 ++-- cpp/src/text/ngrams_tokenize.cu | 42 ++-- cpp/src/text/normalize.cu | 44 ++-- cpp/src/text/replace.cu | 34 ++-- cpp/src/text/stemmer.cu | 60 +++--- 
cpp/src/text/subword/data_normalizer.cu | 25 ++- .../text/subword/detail/data_normalizer.hpp | 5 +- .../text/subword/detail/tokenizer_utils.cuh | 6 +- .../subword/detail/wordpiece_tokenizer.hpp | 8 +- cpp/src/text/subword/load_hash_file.cu | 42 ++-- cpp/src/text/subword/subword_tokenize.cu | 13 +- cpp/src/text/subword/wordpiece_tokenizer.cu | 87 ++++---- cpp/src/text/tokenize.cu | 64 +++--- cpp/src/unary/math_ops.cu | 18 +- cpp/src/unary/unary_ops.cuh | 2 +- cpp/tests/column/column_device_view_test.cu | 24 ++- cpp/tests/column/factories_test.cpp | 12 +- cpp/tests/copying/copy_tests.cu | 29 +-- cpp/tests/copying/gather_struct_tests.cu | 33 ++- cpp/tests/copying/shift_tests.cpp | 14 +- cpp/tests/datetime/datetime_ops_test.cpp | 15 +- cpp/tests/error/error_handling_test.cu | 27 ++- cpp/tests/groupby/group_std_test.cpp | 4 + cpp/tests/groupby/group_var_test.cpp | 4 + cpp/tests/hash_map/map_test.cu | 13 +- cpp/tests/hash_map/multimap_test.cu | 13 +- cpp/tests/io/parquet_test.cpp | 27 +-- cpp/tests/scalar/factories_test.cpp | 13 +- cpp/tests/table/table_view_tests.cu | 17 +- cpp/tests/wrappers/timestamps_test.cu | 17 +- java/src/main/native/src/TableJni.cpp | 8 +- java/src/main/native/src/map_lookup.cu | 2 +- 235 files changed, 3179 insertions(+), 2719 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25bbb34fbe1..b052722ca65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,6 +71,7 @@ - PR #6612 Update JNI to new RMM cuda_stream_view API - PR #6646 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 1) - PR #6648 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 2) +- PR #6744 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 3) - PR #6579 Update scatter APIs to use reference wrapper / const scalar - PR #6614 Add support for conversion to Pandas nullable dtypes and fix related issue in `cudf.to_json` - PR #6622 Update `to_pandas` api docs diff --git a/cpp/benchmarks/synchronization/synchronization.cpp 
b/cpp/benchmarks/synchronization/synchronization.cpp index a2de31e53d3..c5a88bd6410 100644 --- a/cpp/benchmarks/synchronization/synchronization.cpp +++ b/cpp/benchmarks/synchronization/synchronization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,13 +15,15 @@ */ #include "synchronization.hpp" + #include +#include #include cuda_event_timer::cuda_event_timer(benchmark::State& state, bool flush_l2_cache, - cudaStream_t stream) + rmm::cuda_stream_view stream) : p_state(&state), stream(stream) { // flush all of L2$ @@ -35,18 +37,19 @@ cuda_event_timer::cuda_event_timer(benchmark::State& state, if (l2_cache_bytes > 0) { const int memset_value = 0; rmm::device_buffer l2_cache_buffer(l2_cache_bytes, stream); - CUDA_TRY(cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream)); + CUDA_TRY( + cudaMemsetAsync(l2_cache_buffer.data(), memset_value, l2_cache_bytes, stream.value())); } } CUDA_TRY(cudaEventCreate(&start)); CUDA_TRY(cudaEventCreate(&stop)); - CUDA_TRY(cudaEventRecord(start, stream)); + CUDA_TRY(cudaEventRecord(start, stream.value())); } cuda_event_timer::~cuda_event_timer() { - CUDA_TRY(cudaEventRecord(stop, stream)); + CUDA_TRY(cudaEventRecord(stop, stream.value())); CUDA_TRY(cudaEventSynchronize(stop)); float milliseconds = 0.0f; diff --git a/cpp/benchmarks/synchronization/synchronization.hpp b/cpp/benchmarks/synchronization/synchronization.hpp index 9e214907812..5e84e9fb9ae 100644 --- a/cpp/benchmarks/synchronization/synchronization.hpp +++ b/cpp/benchmarks/synchronization/synchronization.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ for (auto _ : state){ - cudaStream_t stream = 0; + rmm::cuda_stream_view stream{}; // default stream, could be another stream // Create (Construct) an object of this class. You HAVE to pass in the // benchmark::State object you are using. It measures the time from its @@ -44,7 +44,7 @@ cuda_event_timer raii(state, true, stream); // flush_l2_cache = true // Now perform the operations that is to be benchmarked - sample_kernel<<<1, 256, 0, stream>>>(); // Possibly launching a CUDA kernel + sample_kernel<<<1, 256, 0, stream.value()>>>(); // Possibly launching a CUDA kernel } } @@ -61,8 +61,11 @@ // Google Benchmark library #include + #include +#include + #include class cuda_event_timer { @@ -77,7 +80,9 @@ class cuda_event_timer { * every iteration. * @param[in] stream_ The CUDA stream we are measuring time on. **/ - cuda_event_timer(benchmark::State& state, bool flush_l2_cache, cudaStream_t stream_ = 0); + cuda_event_timer(benchmark::State& state, + bool flush_l2_cache, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); // The user must provide a benchmark::State object to set // the timer so we disable the default c'tor. 
@@ -91,7 +96,7 @@ class cuda_event_timer { private: cudaEvent_t start; cudaEvent_t stop; - cudaStream_t stream; + rmm::cuda_stream_view stream; benchmark::State* p_state; }; diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index cfd637570fe..0af8dd6a500 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -184,7 +184,7 @@ std::unique_ptr sample( */ std::unique_ptr get_element(column_view const& input, size_type index, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index afe67540c42..feb7255eec8 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -185,7 +185,7 @@ void copy_range(SourceValueIterator source_value_begin, nullptr); } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); } /** diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index 1df17585c99..a45c4f86ba4 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -171,10 +171,9 @@ struct column_scatterer_impl { "scatter dictionary keys must be the same type"); // first combine keys so both dictionaries have the same set - auto target_matched = dictionary::detail::add_keys(target, source.keys(), mr, stream.value()); + auto target_matched = dictionary::detail::add_keys(target, source.keys(), stream, mr); auto const target_view = dictionary_column_view(target_matched->view()); - auto source_matched = dictionary::detail::set_keys( - source, target_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); + auto source_matched = dictionary::detail::set_keys(source, target_view.keys(), stream); auto const source_view = dictionary_column_view(source_matched->view()); // now build the new indices by doing a scatter on just the matched 
indices diff --git a/cpp/include/cudf/dictionary/detail/encode.hpp b/cpp/include/cudf/dictionary/detail/encode.hpp index 8a1cff84119..933512efdde 100644 --- a/cpp/include/cudf/dictionary/detail/encode.hpp +++ b/cpp/include/cudf/dictionary/detail/encode.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -44,15 +46,15 @@ namespace detail { * * @param column The column to dictionary encode. * @param indices_type The integer type to use for the indices. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return Returns a dictionary column. */ std::unique_ptr encode( column_view const& column, data_type indices_type = data_type{type_id::UINT32}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create a column by gathering the keys from the provided @@ -65,14 +67,14 @@ std::unique_ptr encode( * ``` * * @param dictionary_column Existing dictionary column. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with type matching the dictionary_column's keys. 
*/ std::unique_ptr decode( dictionary_column_view const& dictionary_column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Return minimal integer type for the given number of elements. diff --git a/cpp/include/cudf/dictionary/detail/merge.hpp b/cpp/include/cudf/dictionary/detail/merge.hpp index 4b9cb634b74..521d36e229e 100644 --- a/cpp/include/cudf/dictionary/detail/merge.hpp +++ b/cpp/include/cudf/dictionary/detail/merge.hpp @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -33,15 +35,15 @@ namespace detail { * @param lcol First column. * @param rcol Second column. * @param row_order Indexes for each column. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. 
*/ std::unique_ptr merge(dictionary_column_view const& lcol, dictionary_column_view const& rcol, cudf::detail::index_vector const& row_order, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp index ec6a9af61cf..9d3cc9f90bc 100644 --- a/cpp/include/cudf/dictionary/detail/update_keys.hpp +++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -31,8 +33,8 @@ namespace detail { std::unique_ptr add_keys( dictionary_column_view const& dictionary_column, column_view const& new_keys, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view @@ -43,8 +45,8 @@ std::unique_ptr add_keys( std::unique_ptr remove_keys( dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view @@ -54,8 +56,8 @@ std::unique_ptr remove_keys( */ std::unique_ptr remove_unused_keys( dictionary_column_view const& dictionary_column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); /** * @copydoc cudf::dictionary::set_keys(dictionary_column_view @@ -66,8 +68,8 @@ std::unique_ptr remove_unused_keys( std::unique_ptr set_keys( dictionary_column_view const& dictionary_column, column_view const& keys, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create new dictionaries that have keys merged from the input dictionaries. @@ -82,8 +84,8 @@ std::unique_ptr set_keys( */ std::vector> match_dictionaries( std::vector input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create new dictionaries that have keys merged from dictionary columns @@ -106,8 +108,8 @@ std::vector> match_dictionaries( */ std::pair>, std::vector> match_dictionaries( std::vector tables, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/dictionary_factories.hpp b/cpp/include/cudf/dictionary/dictionary_factories.hpp index fe0b92e7df4..6e5f5fa6539 100644 --- a/cpp/include/cudf/dictionary/dictionary_factories.hpp +++ b/cpp/include/cudf/dictionary/dictionary_factories.hpp @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { /** * @addtogroup column_factories Factories @@ -54,15 +56,15 @@ namespace cudf { * * @param keys_column Column of unique, ordered values to use as the new dictionary column's keys. 
* @param indices_column Indices to use for the new dictionary column. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. */ std::unique_ptr make_dictionary_column( column_view const& keys_column, column_view const& indices_column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a dictionary column by taking ownership of the provided keys @@ -106,15 +108,15 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys_colu * * @param keys Column of unique, ordered values to use as the new dictionary column's keys. * @param indices Indices values and null-mask to use for the new dictionary column. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. 
*/ std::unique_ptr make_dictionary_column( std::unique_ptr keys_column, std::unique_ptr indices_column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 8f8aadccde5..fc809b03dfa 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,8 @@ #include #include +#include + #include #include @@ -222,13 +224,13 @@ class groupby { */ std::pair, std::vector> dispatch_aggregation( std::vector const& requests, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); // Sort-based groupby std::pair, std::vector> sort_aggregate( std::vector const& requests, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); }; /** @} */ diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp index 0311b9e92cb..18398ff4ceb 100644 --- a/cpp/include/cudf/io/avro.hpp +++ b/cpp/include/cudf/io/avro.hpp @@ -18,11 +18,11 @@ #include "types.hpp" -#include - #include #include +#include + #include #include #include diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 2511a366ca8..4b7f3e22601 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -17,7 +17,6 @@ #pragma once #include - #include #include #include diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 9f16ffa3105..6c830e31a56 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ 
b/cpp/include/cudf/io/data_sink.hpp @@ -16,13 +16,15 @@ #pragma once +#include +#include + +#include + #include #include #include -#include -#include - namespace cudf { //! IO interfaces namespace io { @@ -113,7 +115,7 @@ class data_sink { * * @return void **/ - virtual void device_write(void const* gpu_data, size_t size, cudaStream_t stream) + virtual void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) { CUDF_FAIL("data_sink classes that support device_write must override this function."); } diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index a99bac3f7f1..88f2bd187e2 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -16,6 +16,9 @@ #pragma once +#include +#include + #include #include #include @@ -23,9 +26,6 @@ #include -#include -#include - namespace cudf { //! IO interfaces namespace io { diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index c965bfbfb21..40090dbc438 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -23,6 +23,8 @@ #include +#include + namespace cudf { namespace io { namespace detail { @@ -71,7 +73,8 @@ class reader { * * @return The set of columns along with table metadata */ - table_with_metadata read(avro_reader_options const &options, cudaStream_t stream = 0); + table_with_metadata read(avro_reader_options const &options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; } // namespace avro } // namespace detail diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 8a8d07a353c..7790c2ceee1 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -18,6 +18,8 @@ #include +#include + namespace cudf { namespace io { namespace detail { @@ -65,7 +67,7 @@ class reader { * * @return The set of columns along with table metadata */ - table_with_metadata read(cudaStream_t stream = 
0); + table_with_metadata read(rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; class writer { @@ -104,7 +106,7 @@ class writer { */ void write(table_view const &table, const table_metadata *metadata = nullptr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; } // namespace csv } // namespace detail diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 62a209b57f4..2176381879a 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -23,6 +23,8 @@ #include +#include + // Forward declarations namespace arrow { namespace io { @@ -77,7 +79,8 @@ class reader { * @param[in] options Settings for controlling reading behavior * @return cudf::table object that contains the array of cudf::column. */ - table_with_metadata read(json_reader_options const &options, cudaStream_t stream = 0); + table_with_metadata read(json_reader_options const &options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; } // namespace json diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 509aea4c6e1..15969ac6137 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -18,6 +18,8 @@ #include +#include + namespace cudf { namespace io { namespace detail { @@ -66,7 +68,8 @@ class reader { * * @return The set of columns along with table metadata */ - table_with_metadata read(orc_reader_options const& options, cudaStream_t stream = 0); + table_with_metadata read(orc_reader_options const& options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; /** @@ -103,7 +106,7 @@ class writer { */ void write(table_view const& table, const table_metadata* metadata = nullptr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Begins the chunked/streamed write process. 
diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 586ff497972..1769c72e1c8 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -22,6 +22,8 @@ #include +#include + namespace cudf { namespace io { namespace detail { @@ -70,7 +72,8 @@ class reader { * * @return The set of columns along with table metadata */ - table_with_metadata read(parquet_reader_options const& options, cudaStream_t stream = 0); + table_with_metadata read(parquet_reader_options const& options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); }; /** @@ -108,12 +111,13 @@ class writer { * @param int96_timestamps If true, write timestamps as INT96 values * @param stream CUDA stream used for device memory operations and kernel launches. */ - std::unique_ptr> write(table_view const& table, - const table_metadata* metadata = nullptr, - bool return_filemetadata = false, - const std::string column_chunks_file_path = "", - bool int96_timestamps = false, - cudaStream_t stream = 0); + std::unique_ptr> write( + table_view const& table, + const table_metadata* metadata = nullptr, + bool return_filemetadata = false, + const std::string column_chunks_file_path = "", + bool int96_timestamps = false, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Begins the chunked/streamed write process. 
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index d4b627dd145..262d79b64c2 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -18,11 +18,11 @@ #include "types.hpp" -#include - #include #include +#include + #include #include diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index a50ab95195d..fe6eda101d8 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -22,6 +22,7 @@ #pragma once #include + #include #include #include diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 981bc46d046..37847c41339 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -19,6 +19,8 @@ #include #include +#include + #include namespace cudf { @@ -396,7 +398,7 @@ class hash_join { */ hash_join(cudf::table_view const& build, std::vector const& build_on, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Controls where common columns will be output for a inner join. @@ -449,8 +451,8 @@ class hash_join { std::vector> const& columns_in_common, common_columns_output_side common_columns_output_side = common_columns_output_side::PROBE, null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** * @brief Performs a left join by probing in the internal hash table. 
@@ -479,8 +481,8 @@ class hash_join { std::vector const& probe_on, std::vector> const& columns_in_common, null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** * @brief Performs a full join by probing in the internal hash table. @@ -509,8 +511,8 @@ class hash_join { std::vector const& probe_on, std::vector> const& columns_in_common, null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; private: struct hash_join_impl; diff --git a/cpp/include/cudf/lists/detail/copying.hpp b/cpp/include/cudf/lists/detail/copying.hpp index 6482518303b..cfa1980e665 100644 --- a/cpp/include/cudf/lists/detail/copying.hpp +++ b/cpp/include/cudf/lists/detail/copying.hpp @@ -17,6 +17,8 @@ #include +#include + namespace cudf { namespace lists { namespace detail { @@ -43,7 +45,7 @@ namespace detail { std::unique_ptr copy_slice(lists_column_view const& lists, size_type start, size_type end, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 439bd7ab089..b035ae62408 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -51,7 +51,7 @@ struct gather_data { * @copydoc cudf::make_gather_data(cudf::lists_column_view const& source_column, * MapItType gather_map, * size_type gather_map_size, - * cudaStream_t stream, + * rmm::cuda_stream_view stream, * 
rmm::mr::device_memory_resource* mr) * * @param prev_base_offsets The buffer backing the base offsets used in the gather map. We can diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 65bdfb349c8..1de4461f703 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -16,19 +16,23 @@ #pragma once -#include -#include -#include #include #include #include #include #include #include +#include +#include #include #include + #include +#include + +#include + namespace cudf { namespace lists { namespace detail { @@ -385,9 +389,9 @@ struct list_child_constructor { (unbound_list_row.label() == unbound_list_view::label_type::SOURCE ? source_lists : target_lists); auto const list_begin_offset = - bound_column.offsets().element(unbound_list_row.row_index()); + bound_column.offsets().template element(unbound_list_row.row_index()); auto const list_end_offset = - bound_column.offsets().element(unbound_list_row.row_index() + 1); + bound_column.offsets().template element(unbound_list_row.row_index() + 1); #ifndef NDEBUG printf( @@ -493,10 +497,10 @@ struct list_child_constructor { // string_views should now have been populated with source and target references. auto string_offsets = cudf::strings::detail::child_offsets_from_string_iterator( - string_views.begin(), string_views.size(), mr, stream.value()); + string_views.begin(), string_views.size(), stream, mr); auto string_chars = cudf::strings::detail::child_chars_from_string_vector( - string_views, string_offsets->view().template data(), 0, mr, stream.value()); + string_views, string_offsets->view().template data(), 0, stream, mr); auto child_null_mask = source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() ? 
construct_child_nullmask( @@ -587,7 +591,7 @@ struct list_child_constructor { child_list_views.begin(), [] __device__(auto const& row) { return row.size(); }); auto child_offsets = cudf::strings::detail::make_offsets_child_column( - begin, begin + child_list_views.size(), mr, stream.value()); + begin, begin + child_list_views.size(), stream, mr); auto child_column = cudf::type_dispatcher(source_lists_column_view.child().child(1).type(), @@ -695,7 +699,7 @@ std::unique_ptr scatter( auto list_size_begin = thrust::make_transform_iterator( target_vector.begin(), [] __device__(unbound_list_view l) { return l.size(); }); auto offsets_column = cudf::strings::detail::make_offsets_child_column( - list_size_begin, list_size_begin + target.size(), mr, stream.value()); + list_size_begin, list_size_begin + target.size(), stream, mr); auto child_column = cudf::type_dispatcher(child_column_type, list_child_constructor{}, diff --git a/cpp/include/cudf/scalar/scalar_factories.hpp b/cpp/include/cudf/scalar/scalar_factories.hpp index feade65f31a..5271bed14c8 100644 --- a/cpp/include/cudf/scalar/scalar_factories.hpp +++ b/cpp/include/cudf/scalar/scalar_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,8 @@ #include +#include + namespace cudf { /** * @addtogroup scalar_factories @@ -38,7 +40,7 @@ namespace cudf { */ std::unique_ptr make_numeric_scalar( data_type type, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -54,7 +56,7 @@ std::unique_ptr make_numeric_scalar( */ std::unique_ptr make_timestamp_scalar( data_type type, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -70,7 +72,7 @@ std::unique_ptr make_timestamp_scalar( */ std::unique_ptr make_duration_scalar( data_type type, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -86,7 +88,7 @@ std::unique_ptr make_duration_scalar( */ std::unique_ptr make_fixed_width_scalar( data_type type, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -102,7 +104,7 @@ std::unique_ptr make_fixed_width_scalar( */ std::unique_ptr make_string_scalar( std::string const& string, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -125,7 +127,7 @@ std::unique_ptr make_default_constructed_scalar(data_type type); template std::unique_ptr make_fixed_width_scalar( T value, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { return std::make_unique>(value, true, stream, mr); @@ -143,7 +145,7 @@ template std::unique_ptr make_fixed_point_scalar( typename T::rep value, numeric::scale_type scale, - cudaStream_t stream 
= 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { return std::make_unique>(value, scale, true, stream, mr); diff --git a/cpp/include/cudf/strings/copying.hpp b/cpp/include/cudf/strings/copying.hpp index 70aa89fdfea..b4455e2c3b4 100644 --- a/cpp/include/cudf/strings/copying.hpp +++ b/cpp/include/cudf/strings/copying.hpp @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -50,7 +52,7 @@ std::unique_ptr copy_slice( size_type start, size_type end = -1, size_type step = 1, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index c45bc9558ed..ed783ca996c 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -33,7 +35,7 @@ namespace detail { std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -45,7 +47,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, std::unique_ptr join_strings(strings_column_view const& strings, string_scalar const& separator, string_scalar const& narep, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail diff --git a/cpp/include/cudf/strings/detail/converters.hpp b/cpp/include/cudf/strings/detail/converters.hpp index 59348d85473..098dc1a38dc 100644 --- a/cpp/include/cudf/strings/detail/converters.hpp +++ b/cpp/include/cudf/strings/detail/converters.hpp @@ -19,6 +19,8 
@@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -30,7 +32,7 @@ namespace detail { */ std::unique_ptr to_integers(strings_column_view const& strings, data_type output_type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -39,7 +41,7 @@ std::unique_ptr to_integers(strings_column_view const& strings, * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr from_integers(column_view const& integers, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -49,7 +51,7 @@ std::unique_ptr from_integers(column_view const& integers, */ std::unique_ptr to_floats(strings_column_view const& strings, data_type output_type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -58,7 +60,7 @@ std::unique_ptr to_floats(strings_column_view const& strings, * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr from_floats(column_view const& floats, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -69,7 +71,7 @@ std::unique_ptr from_floats(column_view const& floats, */ std::unique_ptr to_booleans(strings_column_view const& strings, string_scalar const& true_string, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -81,7 +83,7 @@ std::unique_ptr to_booleans(strings_column_view const& strings, std::unique_ptr from_booleans(column_view const& booleans, string_scalar const& true_string, string_scalar const& false_string, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -93,7 +95,7 @@ std::unique_ptr from_booleans(column_view const& booleans, std::unique_ptr to_timestamps(strings_column_view const& strings, data_type timestamp_type, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -104,7 +106,7 @@ std::unique_ptr to_timestamps(strings_column_view const& strings, */ std::unique_ptr from_timestamps(column_view const& timestamps, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -116,7 +118,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, std::unique_ptr to_durations(strings_column_view const& strings, data_type duration_type, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -127,7 +129,7 @@ std::unique_ptr to_durations(strings_column_view const& strings, */ std::unique_ptr from_durations(column_view const& durations, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh 
b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 3433ab7d210..96961feee04 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -60,7 +60,7 @@ std::unique_ptr copy_if_else( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = std::distance(lhs_begin, lhs_end); - if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); // create null mask @@ -88,14 +88,13 @@ std::unique_ptr copy_if_else( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().template data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = - create_chars_child_column(strings_count, null_count, bytes, mr, stream.value()); - auto d_chars = chars_column->mutable_view().template data(); + size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; + auto chars_column = create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); // fill in chars thrust::for_each_n( execpol->on(stream.value()), diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index 563f66ad2c8..68a0c1d7733 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -156,7 +156,7 @@ std::unique_ptr copy_range( source_value_begin, source_validity_begin, d_target, target_begin, target_end}); p_offsets_column = detail::make_offsets_child_column( - string_size_begin, string_size_begin + target.size(), mr, stream.value()); + string_size_begin, string_size_begin + target.size(), stream, mr); } else if (null_count > 0) { // check validities for source only auto string_size_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -164,7 +164,7 @@ std::unique_ptr copy_range( source_value_begin, source_validity_begin, d_target, target_begin, target_end}); p_offsets_column = detail::make_offsets_child_column( - string_size_begin, string_size_begin + target.size(), mr, stream.value()); + string_size_begin, string_size_begin + target.size(), stream, mr); } else { // no need to check validities auto string_size_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -172,7 +172,7 @@ std::unique_ptr copy_range( source_value_begin, source_validity_begin, d_target, target_begin, target_end}); p_offsets_column = detail::make_offsets_child_column( - string_size_begin, string_size_begin + target.size(), mr, stream.value()); + string_size_begin, string_size_begin + target.size(), stream, mr); } // create the chars column @@ -182,7 +182,7 @@ std::unique_ptr copy_range( auto chars_bytes = p_offsets[target.size()]; auto p_chars_column = strings::detail::create_chars_child_column( - target.size(), null_count, chars_bytes, mr, stream.value()); + target.size(), null_count, chars_bytes, stream, mr); // copy to the chars column diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index e70dbd399c9..8f457d9e48f 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -1,5 +1,5 @@ 
/* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,7 +68,7 @@ std::unique_ptr gather( { auto output_count = std::distance(begin, end); auto strings_count = strings.size(); - if (output_count == 0) return make_empty_strings_column(mr, stream.value()); + if (output_count == 0) return make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -82,13 +82,13 @@ std::unique_ptr gather( }; auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + output_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + output_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.template data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[output_count]; - auto chars_column = create_chars_child_column(output_count, 0, bytes, mr, stream.value()); + auto chars_column = create_chars_child_column(output_count, 0, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); // fill in chars diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index 6bdbce3c933..4a3cde89b30 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,6 +24,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -46,12 +48,12 @@ std::unique_ptr merge(strings_column_view const& lhs, strings_column_view const& rhs, row_order_iterator begin, row_order_iterator end, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using cudf::detail::side; size_type strings_count = static_cast(std::distance(begin, end)); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); auto lhs_column = column_device_view::create(lhs.parent(), stream); auto d_lhs = *lhs_column; @@ -75,16 +77,16 @@ std::unique_ptr merge(strings_column_view const& lhs, }; auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().template data(); // create the chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); // merge the strings auto d_chars = chars_column->mutable_view().template data(); - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_lhs, d_rhs, begin, d_offsets, d_chars] __device__(size_type idx) { diff --git a/cpp/include/cudf/strings/detail/modify_strings.cuh b/cpp/include/cudf/strings/detail/modify_strings.cuh index c90ca4575f8..b2fcb16dbd6 100644 --- 
a/cpp/include/cudf/strings/detail/modify_strings.cuh +++ b/cpp/include/cudf/strings/detail/modify_strings.cuh @@ -43,22 +43,22 @@ namespace detail { * * @param strings Number Column of strings to apply the modifications on; * it is not modified in place; rather a new column is returned instead - * @param mr Device memory resource used to allocate the returned column's device memory. - * (cannot be a default argument because of the variadic pack); * @param stream CUDA stream used for device memory operations and kernel launches. * (cannot be a default argument because of the variadic pack); + * @param mr Device memory resource used to allocate the returned column's device memory. + * (cannot be a default argument because of the variadic pack); * @param ...args Additional arguments to be forwarded to * the probe / execute constructors (can be empty); * @return modified strings column */ template std::unique_ptr modify_strings(strings_column_view const& strings, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr, - cudaStream_t stream, Types&&... 
args) { auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); @@ -67,8 +67,7 @@ std::unique_ptr modify_strings(strings_column_view const& strings, size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // get the lookup tables used for case conversion device_probe_functor d_probe_fctr{d_column, std::forward(args)...}; @@ -77,7 +76,7 @@ std::unique_ptr modify_strings(strings_column_view const& strings, auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), d_probe_fctr); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.template data(); // not sure why this requires `.template` and the next @@ -86,19 +85,18 @@ std::unique_ptr modify_strings(strings_column_view const& strings, // build the chars column -- convert characters based on case_flag parameter size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); device_execute_functor d_execute_fctr{ d_column, d_new_offsets, d_chars, std::forward(args)...}; - thrust::for_each_n(execpol->on(stream), + 
thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, d_execute_fctr); - // return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index 9e0497052a6..5d64cb9944b 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -61,7 +61,7 @@ std::unique_ptr scatter( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = target.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return make_empty_strings_column(stream, mr); // create null mask -- caller must update this rmm::device_buffer null_mask{0, stream, mr}; @@ -75,10 +75,10 @@ std::unique_ptr scatter( rmm::exec_policy(stream)->on(stream.value()), begin, end, scatter_map, target_vector.begin()); // build offsets column - auto offsets_column = child_offsets_from_string_vector(target_vector, mr, stream.value()); + auto offsets_column = child_offsets_from_string_vector(target_vector, stream, mr); // build chars column auto chars_column = child_chars_from_string_vector( - target_vector, offsets_column->view().data(), 0, mr, stream.value()); + target_vector, offsets_column->view().data(), 0, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 6ec055d0aee..dcc15b00c28 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -14,45 +14,46 @@ * limitations under the License. 
*/ +#include + #include #include #include #include #include #include -#include #include +#include + #include #include -// clang-format off namespace cudf { namespace strings { namespace detail { // Create a strings-type column from vector of pointer/size pairs -template -std::unique_ptr make_strings_column( - IndexPairIterator begin, IndexPairIterator end, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream ) +template +std::unique_ptr make_strings_column(IndexPairIterator begin, + IndexPairIterator end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - size_type strings_count = thrust::distance(begin,end); - if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream); + size_type strings_count = thrust::distance(begin, end); + if (strings_count == 0) return strings::detail::make_empty_strings_column(stream, mr); using string_index_pair = thrust::pair; - auto execpol = rmm::exec_policy(stream); + auto execpol = rmm::exec_policy(stream); + // check total size is not too large for cudf column + auto size_checker = [] __device__(string_index_pair const& item) { + return (item.first != nullptr) ? item.second : 0; + }; size_t bytes = thrust::transform_reduce( - execpol->on(stream), begin, end, - [] __device__(string_index_pair const& item) { - return (item.first != nullptr) ? 
item.second : 0; - }, - 0, - thrust::plus()); + execpol->on(stream.value()), begin, end, size_checker, 0, thrust::plus()); CUDF_EXPECTS(bytes < std::numeric_limits::max(), "total size of strings is too large for cudf column"); @@ -64,30 +65,28 @@ std::unique_ptr make_strings_column( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().template data(); // create null mask - auto new_nulls = cudf::detail::valid_if( begin, end, - [] __device__(string_index_pair const item) { return item.first != nullptr; }, - stream, - mr); + auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; + auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); auto null_count = new_nulls.second; rmm::device_buffer null_mask{0, stream, mr}; if (null_count > 0) null_mask = std::move(new_nulls.first); // build chars column auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); - auto d_chars = chars_column->mutable_view().template data(); - thrust::for_each_n(execpol->on(stream), + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); + auto copy_chars = [begin, d_offsets, d_chars] __device__(size_type idx) { + string_index_pair const item = begin[idx]; + if (item.first != nullptr) memcpy(d_chars + d_offsets[idx], item.first, item.second); + }; + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, - [begin, d_offsets, d_chars] __device__(size_type idx) { - string_index_pair const item = begin[idx]; - if 
(item.first != nullptr) - memcpy(d_chars + d_offsets[idx], item.first, item.second); - }); + copy_chars); return make_strings_column(strings_count, std::move(offsets_column), @@ -101,4 +100,3 @@ std::unique_ptr make_strings_column( } // namespace detail } // namespace strings } // namespace cudf -// clang-format on diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index 11b5455df04..a5c466ecad2 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,12 +15,14 @@ */ #pragma once -#include #include #include #include +#include + #include + #include #include @@ -43,8 +45,8 @@ template std::unique_ptr make_offsets_child_column( InputIterator begin, InputIterator end, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(begin < end, "Invalid iterator range"); auto count = thrust::distance(begin, end); @@ -57,8 +59,8 @@ std::unique_ptr make_offsets_child_column( // Rather than manually computing the final offset using values in device memory, // we use inclusive-scan on a shifted output (d_offsets+1) and then set the first // offset values to zero manually.
- thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), begin, end, d_offsets + 1); - CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(int32_t), stream)); + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), begin, end, d_offsets + 1); + CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(int32_t), stream.value())); return offsets_column; } @@ -76,12 +78,12 @@ template std::unique_ptr child_offsets_from_string_iterator( Iter strings_begin, cudf::size_type num_strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto transformer = [] __device__(string_view v) { return v.size_bytes(); }; auto begin = thrust::make_transform_iterator(strings_begin, transformer); - return make_offsets_child_column(begin, begin + num_strings, mr, stream); + return make_offsets_child_column(begin, begin + num_strings, stream, mr); } // This template is a thin wrapper around per-context singleton objects. diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index 4e313a12121..c3b953b4211 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,11 +15,12 @@ */ #pragma once -#include -#include #include #include +#include +#include + namespace cudf { namespace strings { namespace detail { @@ -30,27 +31,27 @@ namespace detail { * @param strings_count Number of strings in the column. * @param null_count Number of null string entries in the column. * @param bytes Number of bytes for the chars column. 
- * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return The chars child column for a strings column. */ std::unique_ptr create_chars_child_column( size_type strings_count, size_type null_count, size_type bytes, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create a strings column with no strings. * - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return Empty strings column */ std::unique_ptr make_empty_strings_column( - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a string_view vector from a strings column. @@ -59,21 +60,21 @@ std::unique_ptr make_empty_strings_column( * @param stream CUDA stream used for device memory operations and kernel launches. * @return Device vector of string_views */ -rmm::device_vector create_string_vector_from_column(cudf::strings_column_view strings, - cudaStream_t stream = 0); +rmm::device_vector create_string_vector_from_column( + cudf::strings_column_view strings, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Creates an offsets column from a string_view vector. 
* * @param strings Strings column - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return Child offsets column */ std::unique_ptr child_offsets_from_string_vector( const rmm::device_vector& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a chars column from a string_view vector. @@ -81,16 +82,16 @@ std::unique_ptr child_offsets_from_string_vector( * @param strings Strings vector * @param d_offsets Offsets vector for placing strings into column's memory. * @param null_count Number of null strings. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
* @return Child chars column */ std::unique_ptr child_chars_from_string_vector( const rmm::device_vector& strings, const int32_t* d_offsets, cudf::size_type null_count, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index b6ae22f6b6a..f30316eda10 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include /** * @file @@ -114,7 +115,7 @@ void print(strings_column_view const& strings, */ std::pair, rmm::device_vector> create_offsets( strings_column_view const& strings, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace strings diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 76d2e57597f..f34a265a50a 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 3e3fe6ad719..dc7635928a7 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -37,11 +37,6 @@ * **/ -/** - * @brief Forward declaration of cudaStream_t - **/ -using cudaStream_t = struct CUstream_st*; - namespace bit_mask { using bit_mask_t = uint32_t; } diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index c244462a390..0cdf0e7fe7b 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -125,7 +125,11 @@ inline void throw_cuda_error(cudaError_t error, const char* file, unsigned int l * **/ #ifndef NDEBUG -#define CHECK_CUDA(stream) CUDA_TRY(cudaStreamSynchronize(stream)); +#define CHECK_CUDA(stream) \ + do { \ + CUDA_TRY(cudaStreamSynchronize(stream)); \ + CUDA_TRY(cudaPeekAtLastError()); \ + } while (0); #else #define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError()); #endif diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index df9c679c15d..9c009ce5b60 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -18,13 +18,13 @@ #include #include +#include #include #include #include #include #include -#include "cudf/structs/struct_view.hpp" namespace cudf { diff --git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp index 8eced8ca056..a75ae3d6181 100644 --- a/cpp/include/nvtext/detail/load_hash_file.hpp +++ b/cpp/include/nvtext/detail/load_hash_file.hpp @@ -15,9 +15,12 @@ */ #pragma once -#include #include +#include + +#include + #include #include @@ -38,7 +41,7 @@ namespace detail { * @return vocabulary hash-table elements */ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail diff --git 
a/cpp/include/nvtext/detail/tokenize.hpp b/cpp/include/nvtext/detail/tokenize.hpp index 9dd06f17ce8..8b74c9cde94 100644 --- a/cpp/include/nvtext/detail/tokenize.hpp +++ b/cpp/include/nvtext/detail/tokenize.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace nvtext { namespace detail { /** @@ -35,8 +37,8 @@ namespace detail { std::unique_ptr tokenize( cudf::strings_column_view const& strings, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc nvtext::tokenize(strings_column_view const&,strings_column_view @@ -51,8 +53,8 @@ std::unique_ptr tokenize( std::unique_ptr tokenize( cudf::strings_column_view const& strings, cudf::strings_column_view const& delimiters, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc nvtext::count_tokens(strings_column_view const&, string_scalar @@ -68,8 +70,8 @@ std::unique_ptr tokenize( std::unique_ptr count_tokens( cudf::strings_column_view const& strings, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc nvtext::count_tokens(strings_column_view const&,strings_column_view @@ -84,8 +86,8 @@ std::unique_ptr count_tokens( std::unique_ptr count_tokens( cudf::strings_column_view const& strings, cudf::strings_column_view const& delimiters, - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace nvtext diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 06f969a9d43..ae0395913cc 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -142,7 +142,7 @@ void set_null_mask(bitmask_type *bitmask, cudf::detail::grid_1d config(number_of_mask_words, 256); set_null_mask_kernel<<>>( static_cast(bitmask), begin_bit, end_bit, valid, number_of_mask_words); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); } } @@ -604,7 +604,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, last_word_indices, stream.value())); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); // third, adjust counts in segment boundaries (if segments are not // word-aligned) @@ -619,7 +619,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, stream.value()>>>( bitmask, num_ranges, d_first_indices.begin(), d_last_indices.begin(), d_null_counts.begin()); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); std::vector ret(num_ranges); CUDA_TRY(cudaMemcpyAsync(ret.data(), diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu index b64f88291b7..f72a65f2348 100644 --- a/cpp/src/column/column.cu +++ b/cpp/src/column/column.cu @@ -268,13 +268,12 @@ struct create_column_from_view { auto num_rows = children.empty() ? 
0 : children.front()->size(); - return make_structs_column( - num_rows, - std::move(children), - view.null_count(), - cudf::detail::copy_bitmask(view.null_mask(), begin, end, rmm::cuda_stream_view{stream}, mr), - stream.value(), - mr); + return make_structs_column(num_rows, + std::move(children), + view.null_count(), + cudf::detail::copy_bitmask(view.null_mask(), begin, end, stream, mr), + stream.value(), + mr); } }; } // anonymous namespace diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 0719af9756b..38466de12c5 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -608,6 +608,7 @@ BufInfo build_output_columns(InputIter begin, src.child_begin(), src.child_end(), current_info, std::back_inserter(children), base_ptr); return column_view{src.type(), size, data_ptr, bitmask_ptr, null_count, 0, std::move(children)}; }); + return current_info; } diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index ff532059108..d41c0a2fa74 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -187,10 +187,9 @@ std::unique_ptr out_of_place_copy_range_dispatch::operator()view()); - auto source_matched = cudf::dictionary::detail::set_keys( - dict_source, target_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); + auto source_matched = cudf::dictionary::detail::set_keys(dict_source, target_view.keys(), stream); auto const source_view = cudf::dictionary_column_view(source_matched->view()); // build the new indices by calling in_place_copy_range on just the indices diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index a6da491f672..94e6be49e9d 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -16,11 +16,12 @@ #include #include +#include #include #include #include -#include +#include namespace cudf { namespace detail { @@ -32,7 +33,7 @@ struct get_element_functor { 
std::unique_ptr operator()( column_view const &input, size_type index, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { auto s = make_fixed_width_scalar(data_type(type_to_id()), stream, mr); @@ -56,7 +57,7 @@ struct get_element_functor { std::unique_ptr operator()( column_view const &input, size_type index, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { auto device_col = column_device_view::create(input, stream); @@ -81,7 +82,7 @@ struct get_element_functor { std::unique_ptr operator()( column_view const &input, size_type index, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { auto dict_view = dictionary_column_view(input); @@ -109,7 +110,7 @@ struct get_element_functor { std::unique_ptr operator()( column_view const &input, size_type index, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { CUDF_FAIL("get_element_functor not supported for list_view"); @@ -119,7 +120,7 @@ struct get_element_functor { std::unique_ptr operator()( column_view const &input, size_type index, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { CUDF_FAIL("get_element_functor not supported for decimal32"); @@ -129,7 +130,7 @@ struct get_element_functor { std::unique_ptr operator()( column_view const &input, size_type index, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { CUDF_FAIL("get_element_functor not supported for decimal64"); @@ -139,7 +140,7 @@ struct get_element_functor { std::unique_ptr operator()( column_view const &input, size_type index, - 
cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { CUDF_FAIL("get_element_functor not supported for struct_view"); @@ -150,7 +151,7 @@ struct get_element_functor { std::unique_ptr get_element(column_view const &input, size_type index, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { CUDF_EXPECTS(index >= 0 and index < input.size(), "Index out of bounds"); @@ -163,7 +164,7 @@ std::unique_ptr get_element(column_view const &input, size_type index, rmm::mr::device_memory_resource *mr) { - return detail::get_element(input, index, 0, mr); + return detail::get_element(input, index, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index d8beb052f8f..90ff5ff3025 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -180,8 +180,8 @@ struct column_scalar_scatterer_impl { auto dict_target = dictionary::detail::add_keys(dictionary_column_view(target), make_column_from_scalar(source.get(), 1, stream)->view(), - mr, - stream.value()); + stream, + mr); auto dict_view = dictionary_column_view(dict_target->view()); auto scalar_index = dictionary::detail::get_index(dict_view, source.get(), stream); auto scalar_iter = thrust::make_permutation_iterator( diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index ce2df92efc0..e4989c743ef 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -137,16 +137,16 @@ struct launch_functor { template typename std::enable_if_t::value, void> operator()( - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { CUDF_FAIL("Cannot extract datetime component from non-timestamp column."); } template typename std::enable_if_t::value, void> operator()( - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { - 
thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), output.begin(), @@ -157,7 +157,7 @@ struct launch_functor { // Create an output column by applying the functor to every element from the input column template std::unique_ptr apply_datetime_op(column_view const& column, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_timestamp(column.type()), "Column type should be timestamp"); @@ -167,13 +167,12 @@ std::unique_ptr apply_datetime_op(column_view const& column, // Return an empty column if source column is empty if (size == 0) return make_empty_column(output_col_type); - auto output = - make_fixed_width_column(output_col_type, - size, - cudf::detail::copy_bitmask(column, rmm::cuda_stream_view{stream}, mr), - column.null_count(), - stream, - mr); + auto output = make_fixed_width_column(output_col_type, + size, + cudf::detail::copy_bitmask(column, stream, mr), + column.null_count(), + stream, + mr); auto launch = launch_functor::type>{ column, static_cast(*output)}; @@ -211,16 +210,16 @@ struct add_calendrical_months_functor { template typename std::enable_if_t::value, void> operator()( - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { CUDF_FAIL("Cannot extract datetime component from non-timestamp column."); } template typename std::enable_if_t::value, void> operator()( - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), timestamp_column.begin(), timestamp_column.end(), months_column.begin(), @@ -253,7 +252,7 @@ struct add_calendrical_months_functor { std::unique_ptr add_calendrical_months(column_view const& timestamp_column, column_view const& months_column, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { 
CUDF_EXPECTS(is_timestamp(timestamp_column.type()), "Column type should be timestamp"); diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 79effe3fc97..daf1bb76916 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -49,8 +49,8 @@ namespace detail { std::unique_ptr add_keys( dictionary_column_view const& dictionary_column, column_view const& new_keys, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(!new_keys.has_nulls(), "Keys must not have nulls"); auto old_keys = dictionary_column.keys(); // [a,b,c,d,f] @@ -116,11 +116,10 @@ std::unique_ptr add_keys( // create new dictionary column with keys_column and indices_column // null mask has not changed - return make_dictionary_column( - std::move(keys_column), - std::move(indices_column), - cudf::detail::copy_bitmask(dictionary_column.parent(), rmm::cuda_stream_view{stream}, mr), - dictionary_column.null_count()); + return make_dictionary_column(std::move(keys_column), + std::move(indices_column), + cudf::detail::copy_bitmask(dictionary_column.parent(), stream, mr), + dictionary_column.null_count()); } } // namespace detail @@ -130,7 +129,7 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::add_keys(dictionary_column, keys, mr); + return detail::add_keys(dictionary_column, keys, rmm::cuda_stream_default, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu index 913da30df16..3822edfc9ef 100644 --- a/cpp/src/dictionary/decode.cu +++ b/cpp/src/dictionary/decode.cu @@ -33,8 +33,8 @@ namespace detail { * @brief Decode a column from a dictionary. 
*/ std::unique_ptr decode(dictionary_column_view const& source, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (source.is_empty()) return make_empty_column(data_type{type_id::EMPTY}); @@ -55,9 +55,8 @@ std::unique_ptr decode(dictionary_column_view const& source, auto output_column = std::unique_ptr(std::move(table_column.front())); // apply any nulls to the output column - output_column->set_null_mask( - cudf::detail::copy_bitmask(source.parent(), rmm::cuda_stream_view{stream}, mr), - source.null_count()); + output_column->set_null_mask(cudf::detail::copy_bitmask(source.parent(), stream, mr), + source.null_count()); return output_column; } @@ -68,7 +67,7 @@ std::unique_ptr decode(dictionary_column_view const& source, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::decode(source, mr); + return detail::decode(source, rmm::cuda_stream_default, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/detail/merge.cu b/cpp/src/dictionary/detail/merge.cu index 6448d711db1..6a2b7f71ae3 100644 --- a/cpp/src/dictionary/detail/merge.cu +++ b/cpp/src/dictionary/detail/merge.cu @@ -22,6 +22,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -29,8 +31,8 @@ namespace detail { std::unique_ptr merge(dictionary_column_view const& lcol, dictionary_column_view const& rcol, cudf::detail::index_vector const& row_order, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const lcol_iter = cudf::detail::indexalator_factory::make_input_iterator(lcol.indices()); auto const rcol_iter = cudf::detail::indexalator_factory::make_input_iterator(rcol.indices()); @@ -44,7 +46,7 @@ std::unique_ptr merge(dictionary_column_view const& lcol, cudf::detail::indexalator_factory::make_output_iterator(indices_column->mutable_view()); // merge the 
input indices columns into the output column - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), row_order.begin(), row_order.end(), output_iter, diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index 17a09e26f7b..73d1becf639 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -29,8 +29,8 @@ namespace { struct dispatch_create_indices { template ()>* = nullptr> std::unique_ptr operator()(column_view const& indices, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(std::is_unsigned(), "indices must be an unsigned type"); column_view indices_view{ @@ -39,8 +39,8 @@ struct dispatch_create_indices { } template ()>* = nullptr> std::unique_ptr operator()(column_view const&, - rmm::mr::device_memory_resource*, - cudaStream_t) + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) { CUDF_FAIL("indices must be an integer type."); } @@ -49,19 +49,18 @@ struct dispatch_create_indices { std::unique_ptr make_dictionary_column(column_view const& keys_column, column_view const& indices_column, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!keys_column.has_nulls(), "keys column must not have nulls"); if (keys_column.is_empty()) return make_empty_column(data_type{type_id::DICTIONARY32}); auto keys_copy = std::make_unique(keys_column, stream, mr); auto indices_copy = - type_dispatcher(indices_column.type(), dispatch_create_indices{}, indices_column, mr, stream); + type_dispatcher(indices_column.type(), dispatch_create_indices{}, indices_column, stream, mr); rmm::device_buffer null_mask{0, stream, mr}; auto null_count = indices_column.null_count(); - if (null_count) - null_mask = 
detail::copy_bitmask(indices_column, rmm::cuda_stream_view{stream}, mr); + if (null_count) null_mask = detail::copy_bitmask(indices_column, stream, mr); std::vector> children; children.emplace_back(std::move(indices_copy)); @@ -117,8 +116,8 @@ struct make_unsigned_fn { std::unique_ptr make_dictionary_column(std::unique_ptr keys, std::unique_ptr indices, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!keys->has_nulls(), "keys column must not have nulls"); diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index 129c9345d4b..501e034c5fe 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -26,6 +26,7 @@ #include #include #include + #include namespace cudf { @@ -38,8 +39,8 @@ namespace detail { */ std::unique_ptr encode(column_view const& input_column, data_type indices_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_unsigned(indices_type), "indices must be type unsigned integer"); CUDF_EXPECTS(input_column.type().id() != type_id::DICTIONARY32, @@ -63,11 +64,10 @@ std::unique_ptr encode(column_view const& input_column, indices_column = cudf::detail::cast(indices_column->view(), indices_type, stream, mr); // create column with keys_column and indices_column - return make_dictionary_column( - std::move(keys_column), - std::move(indices_column), - cudf::detail::copy_bitmask(input_column, rmm::cuda_stream_view{stream}, mr), - input_column.null_count()); + return make_dictionary_column(std::move(keys_column), + std::move(indices_column), + cudf::detail::copy_bitmask(input_column, stream, mr), + input_column.null_count()); } /** @@ -89,7 +89,7 @@ std::unique_ptr encode(column_view const& input_column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::encode(input_column, indices_type, mr); + 
return detail::encode(input_column, indices_type, rmm::cuda_stream_default, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index f0f86a3dd1a..b36b110b13f 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -26,6 +26,8 @@ #include #include +#include + #include #include #include @@ -53,8 +55,8 @@ template std::unique_ptr remove_keys_fn( dictionary_column_view const& dictionary_column, KeysKeeper keys_to_keep_fn, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto const keys_view = dictionary_column.keys(); auto execpol = rmm::exec_policy(stream); @@ -67,7 +69,7 @@ std::unique_ptr remove_keys_fn( auto map_itr = cudf::detail::indexalator_factory::make_output_iterator(map_indices->mutable_view()); // init to max to identify new nulls - thrust::fill(execpol->on(stream), + thrust::fill(execpol->on(stream.value()), map_itr, map_itr + keys_view.size(), max_size); // all valid indices are less than this value @@ -79,7 +81,7 @@ std::unique_ptr remove_keys_fn( auto positions = make_fixed_width_column( indices_type, keys_view.size(), cudf::mask_state::UNALLOCATED, stream); auto itr = cudf::detail::indexalator_factory::make_output_iterator(positions->mutable_view()); - thrust::sequence(execpol->on(stream), itr, itr + keys_view.size()); + thrust::sequence(execpol->on(stream.value()), itr, itr + keys_view.size()); return positions; }(); // copy the non-removed keys ( keys_to_keep_fn(idx)==true ) @@ -93,7 +95,7 @@ std::unique_ptr remove_keys_fn( cudf::detail::indexalator_factory::make_input_iterator(keys_positions->view()); // build indices mapper // Example scatter([0,1,2][0,2,4][max,max,max,max,max]) => [0,max,1,max,2] - thrust::scatter(execpol->on(stream), + 
thrust::scatter(execpol->on(stream.value()), positions_itr, positions_itr + filtered_view.size(), filtered_itr, @@ -145,8 +147,8 @@ std::unique_ptr remove_keys_fn( std::unique_ptr remove_keys( dictionary_column_view const& dictionary_column, column_view const& keys_to_remove, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(!keys_to_remove.has_nulls(), "keys_to_remove must not have nulls"); auto const keys_view = dictionary_column.keys(); @@ -157,13 +159,13 @@ std::unique_ptr remove_keys( auto d_matches = matches->view().data(); // call common utility method to keep the keys not matched to keys_to_remove auto key_matcher = [d_matches] __device__(size_type idx) { return !d_matches[idx]; }; - return remove_keys_fn(dictionary_column, key_matcher, mr, stream); + return remove_keys_fn(dictionary_column, key_matcher, stream, mr); } std::unique_ptr remove_unused_keys( dictionary_column_view const& dictionary_column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { // locate the keys to remove auto const keys_size = dictionary_column.keys_size(); @@ -174,7 +176,7 @@ std::unique_ptr remove_unused_keys( // build keys index to verify against indices values rmm::device_uvector keys_positions(keys_size, stream); thrust::sequence( - rmm::exec_policy(stream)->on(stream), keys_positions.begin(), keys_positions.end()); + rmm::exec_policy(stream)->on(stream.value()), keys_positions.begin(), keys_positions.end()); // wrap the indices for comparison in contains() column_view keys_positions_view(data_type{type_id::UINT32}, keys_size, keys_positions.data()); return 
cudf::detail::contains(keys_positions_view, indices_view, stream, mr); @@ -183,7 +185,7 @@ std::unique_ptr remove_unused_keys( // call common utility method to keep the keys that match auto key_matcher = [d_matches] __device__(size_type idx) { return d_matches[idx]; }; - return remove_keys_fn(dictionary_column, key_matcher, mr, stream); + return remove_keys_fn(dictionary_column, key_matcher, stream, mr); } } // namespace detail @@ -195,14 +197,14 @@ std::unique_ptr remove_keys(dictionary_column_view const& dictionary_col rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::remove_keys(dictionary_column, keys_to_remove, mr); + return detail::remove_keys(dictionary_column, keys_to_remove, rmm::cuda_stream_default, mr); } std::unique_ptr remove_unused_keys(dictionary_column_view const& dictionary_column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::remove_unused_keys(dictionary_column, mr); + return detail::remove_unused_keys(dictionary_column, rmm::cuda_stream_default, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 60e7c496e06..6db30c9765d 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -127,7 +127,7 @@ std::unique_ptr replace_indices(column_view const& input, /** * @copydoc cudf::dictionary::detail::replace_nulls(cudf::column_view const&,cudf::column_view - * const&,rmm::mr::device_memory_resource*,cudaStream_t) + * const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) */ std::unique_ptr replace_nulls(dictionary_column_view const& input, dictionary_column_view const& replacement, @@ -140,7 +140,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, CUDF_EXPECTS(replacement.size() == input.size(), "column sizes must match"); // first combine the keys so both input dictionaries have the same set - auto matched = match_dictionaries({input, replacement}, mr, stream.value()); + auto matched = 
match_dictionaries({input, replacement}, stream, mr); // now build the new indices by doing replace-null using the updated input indices auto const input_indices = @@ -152,16 +152,13 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, : replace_indices( input_indices, make_nullable_index_iterator(repl_indices), stream, mr); - // auto keys_column = ; - return make_dictionary_column(std::move(matched.front()->release().children.back()), - std::move(new_indices), - mr, - stream.value()); + return make_dictionary_column( + std::move(matched.front()->release().children.back()), std::move(new_indices), stream, mr); } /** * @copydoc cudf::dictionary::detail::replace_nulls(cudf::column_view const&,cudf::scalar - * const&,rmm::mr::device_memory_resource*,cudaStream_t) + * const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) */ std::unique_ptr replace_nulls(dictionary_column_view const& input, scalar const& replacement, @@ -175,11 +172,10 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, CUDF_EXPECTS(input.keys().type() == replacement.type(), "keys must match scalar type"); // first add the replacment to the keys so only the indices need to be processed - auto const default_mr = rmm::mr::get_current_device_resource(); - auto input_matched = dictionary::detail::add_keys( - input, make_column_from_scalar(replacement, 1, stream, default_mr)->view(), mr, stream.value()); + auto input_matched = dictionary::detail::add_keys( + input, make_column_from_scalar(replacement, 1, stream)->view(), stream, mr); auto const input_view = dictionary_column_view(input_matched->view()); - auto const scalar_index = get_index(input_view, replacement, stream, default_mr); + auto const scalar_index = get_index(input_view, replacement, stream); // now build the new indices by doing replace-null on the updated indices auto const input_indices = input_view.get_indices_annotated(); @@ -187,10 +183,8 @@ std::unique_ptr replace_nulls(dictionary_column_view 
const& input, replace_indices(input_indices, make_scalar_iterator(*scalar_index), stream, mr); new_indices->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); - return make_dictionary_column(std::move(input_matched->release().children.back()), - std::move(new_indices), - mr, - stream.value()); + return make_dictionary_column( + std::move(input_matched->release().children.back()), std::move(new_indices), stream, mr); } } // namespace detail diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 69fdcd85b35..6889a265c5a 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -28,6 +28,8 @@ #include #include +#include + #include #include #include @@ -49,8 +51,8 @@ struct dispatch_compute_indices { std::unique_ptr> operator()(dictionary_column_view const& input, column_view const& new_keys, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto dictionary_view = column_device_view::create(input.parent(), stream); auto d_dictionary = *dictionary_view; @@ -72,7 +74,7 @@ struct dispatch_compute_indices { mr); auto result_itr = cudf::detail::indexalator_factory::make_output_iterator(result->mutable_view()); - thrust::lower_bound(rmm::exec_policy(stream)->on(stream), + thrust::lower_bound(rmm::exec_policy(stream)->on(stream.value()), new_keys_view->begin(), new_keys_view->end(), dictionary_itr, @@ -88,8 +90,8 @@ struct dispatch_compute_indices { std::unique_ptr> operator()(dictionary_column_view const& input, column_view const& new_keys, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("list_view dictionary set_keys not supported yet"); } @@ -101,8 +103,8 @@ struct dispatch_compute_indices { std::unique_ptr set_keys( dictionary_column_view const& dictionary_column, column_view const& new_keys, - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(!new_keys.has_nulls(), "keys parameter must not have nulls"); auto keys = dictionary_column.keys(); @@ -140,8 +142,8 @@ std::unique_ptr set_keys( dispatch_compute_indices{}, dictionary_column, keys_column->view(), - mr, - stream); + stream, + mr); // create column with keys_column and indices_column return make_dictionary_column(std::move(keys_column), @@ -151,8 +153,8 @@ std::unique_ptr set_keys( } std::vector> match_dictionaries(std::vector input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::vector keys(input.size()); std::transform(input.begin(), input.end(), keys.begin(), [](auto& col) { return col.keys(); }); @@ -160,13 +162,13 @@ std::vector> match_dictionaries(std::vectorview(); std::vector> result(input.size()); std::transform(input.begin(), input.end(), result.begin(), [keys_view, mr, stream](auto& col) { - return set_keys(col, keys_view, mr, stream); + return set_keys(col, keys_view, stream, mr); }); return result; } std::pair>, std::vector> match_dictionaries( - std::vector tables, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + std::vector tables, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // Make a copy of all the column views from each table_view std::vector> updated_columns; @@ -188,7 +190,7 @@ std::pair>, std::vector> match_d return dictionary_column_view(t.column(col_idx)); }); // now match the keys in these dictionary columns - auto dict_cols = dictionary::detail::match_dictionaries(dict_views, mr, stream); + auto dict_cols = dictionary::detail::match_dictionaries(dict_views, stream, mr); // replace the updated_columns vector entries for the set of columns at col_idx auto dict_col_idx = 0; for (auto& v : updated_columns) 
v[col_idx] = dict_cols[dict_col_idx++]->view(); @@ -218,7 +220,7 @@ std::unique_ptr set_keys(dictionary_column_view const& dictionary_column rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::set_keys(dictionary_column, keys, mr); + return detail::set_keys(dictionary_column, keys, rmm::cuda_stream_default, mr); } } // namespace dictionary diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index 77482e13b6c..a9d8a61d88f 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -176,7 +176,7 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()view(), mr, stream.value()); + cudf::dictionary::detail::add_keys(target, scalar_column->view(), stream, mr); cudf::column_view const target_indices = cudf::dictionary_column_view(target_matched->view()).get_indices_annotated(); diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 224f6dfe3a0..46b16ac0949 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -46,7 +46,8 @@ struct count_accessor { cudf::scalar const* p_scalar = nullptr; template - std::enable_if_t::value, cudf::size_type> operator()(cudaStream_t stream = 0) + std::enable_if_t::value, cudf::size_type> operator()( + rmm::cuda_stream_view stream) { using ScalarType = cudf::scalar_type_t; #if 1 @@ -63,7 +64,8 @@ struct count_accessor { } template - std::enable_if_t::value, cudf::size_type> operator()(cudaStream_t stream) + std::enable_if_t::value, cudf::size_type> operator()( + rmm::cuda_stream_view) { CUDF_FAIL("count value should be a integral type."); } diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 6a004393b83..3df6e0ece85 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -55,7 +55,7 @@ groupby::groupby(table_view const& keys, // Select hash vs. sort groupby implementation std::pair, std::vector> groupby::dispatch_aggregation( std::vector const& requests, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // If sort groupby has been called once on this groupby object, then diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 0a56563cf87..3ef97d431cd 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -223,12 +223,12 @@ auto create_hash_map(table_device_view const& d_keys, row_equality_comparator rows_equal{d_keys, d_keys, null_keys_are_equal}; return map_type::create(compute_hash_table_size(d_keys.num_rows()), + stream, unused_key, unused_value, hasher, rows_equal, - allocator_type(), - stream.value()); + allocator_type()); } /** @@ -273,8 +273,8 @@ void compute_single_pass_aggs(table_view const& keys, cudf::detail::initialize_with_identity(table_view, aggs, stream); // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table); - auto d_values = table_device_view::create(flattened_values); + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto d_values = table_device_view::create(flattened_values, stream); rmm::device_vector d_aggs(aggs); bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; @@ -372,7 +372,7 @@ std::unique_ptr
groupby_null_templated(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto d_keys = table_device_view::create(keys); + auto d_keys = table_device_view::create(keys, stream); auto map = create_hash_map(*d_keys, include_null_keys, stream); // Cache of sparse results where the location of aggregate value in each diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu index b49fbeb7387..a22e7619694 100644 --- a/cpp/src/groupby/sort/group_argmax.cu +++ b/cpp/src/groupby/sort/group_argmax.cu @@ -14,9 +14,12 @@ * limitations under the License. */ -#include #include +#include + +#include + #include namespace cudf { @@ -26,16 +29,16 @@ std::unique_ptr group_argmax(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, column_view const& key_sort_order, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto indices = type_dispatcher(values.type(), reduce_functor{}, values, num_groups, group_labels, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); // The functor returns the index of maximum in the sorted values. // We need the index of maximum in the original unsorted values. diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu index 5ae11ba0506..6cdcd7cd94a 100644 --- a/cpp/src/groupby/sort/group_argmin.cu +++ b/cpp/src/groupby/sort/group_argmin.cu @@ -14,9 +14,12 @@ * limitations under the License. 
*/ -#include #include +#include + +#include + #include namespace cudf { @@ -26,16 +29,16 @@ std::unique_ptr group_argmin(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, column_view const& key_sort_order, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto indices = type_dispatcher(values.type(), reduce_functor{}, values, num_groups, group_labels, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); // The functor returns the index of minimum in the sorted values. // We need the index of minimum in the original unsorted values. diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu index aeb9d472b7e..9c8ab92cc50 100644 --- a/cpp/src/groupby/sort/group_collect.cu +++ b/cpp/src/groupby/sort/group_collect.cu @@ -20,14 +20,16 @@ #include #include +#include + namespace cudf { namespace groupby { namespace detail { std::unique_ptr group_collect(column_view const &values, rmm::device_vector const &group_offsets, size_type num_groups, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { rmm::device_buffer offsets_data( group_offsets.data().get(), group_offsets.size() * sizeof(cudf::size_type), stream, mr); diff --git a/cpp/src/groupby/sort/group_count.cu b/cpp/src/groupby/sort/group_count.cu index 504ffe09bc2..d63f691d2e1 100644 --- a/cpp/src/groupby/sort/group_count.cu +++ b/cpp/src/groupby/sort/group_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,8 @@ #include #include +#include + #include #include @@ -28,8 +30,8 @@ namespace detail { std::unique_ptr group_count_valid(column_view const& values, rmm::device_vector const& group_labels, size_type num_groups, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative"); CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), @@ -49,14 +51,14 @@ std::unique_ptr group_count_valid(column_view const& values, thrust::make_transform_iterator(cudf::detail::make_validity_iterator(*values_view), [] __device__(auto b) { return static_cast(b); }); - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), bitmask_iterator, thrust::make_discard_iterator(), result->mutable_view().begin()); } else { - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), thrust::make_constant_iterator(1), @@ -69,8 +71,8 @@ std::unique_ptr group_count_valid(column_view const& values, std::unique_ptr group_count_all(rmm::device_vector const& group_offsets, size_type num_groups, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative"); @@ -79,7 +81,7 @@ std::unique_ptr group_count_all(rmm::device_vector const& gro if (num_groups == 0) { return result; } - thrust::adjacent_difference(rmm::exec_policy(stream)->on(stream), + thrust::adjacent_difference(rmm::exec_policy(stream)->on(stream.value()), group_offsets.begin() + 1, group_offsets.end(), result->mutable_view().begin()); diff --git a/cpp/src/groupby/sort/group_max.cu b/cpp/src/groupby/sort/group_max.cu index 
aec10eec520..06aa172d125 100644 --- a/cpp/src/groupby/sort/group_max.cu +++ b/cpp/src/groupby/sort/group_max.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,22 +16,24 @@ #include +#include + namespace cudf { namespace groupby { namespace detail { std::unique_ptr group_max(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return type_dispatcher(values.type(), reduce_functor{}, values, num_groups, group_labels, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_min.cu b/cpp/src/groupby/sort/group_min.cu index 89405ccc2fe..72bc3e6ba3d 100644 --- a/cpp/src/groupby/sort/group_min.cu +++ b/cpp/src/groupby/sort/group_min.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,22 +16,24 @@ #include +#include + namespace cudf { namespace groupby { namespace detail { std::unique_ptr group_min(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return type_dispatcher(values.type(), reduce_functor{}, values, num_groups, group_labels, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu index bc9d0016207..e7e947b65fc 100644 --- a/cpp/src/groupby/sort/group_nth_element.cu +++ b/cpp/src/groupby/sort/group_nth_element.cu @@ -23,6 +23,8 @@ #include #include +#include + namespace cudf { namespace groupby { namespace detail { @@ -33,8 +35,8 @@ std::unique_ptr group_nth_element(column_view const &values, size_type num_groups, size_type n, null_policy null_handling, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), "Size of values column should be same as that of group labels"); @@ -47,7 +49,7 @@ std::unique_ptr group_nth_element(column_view const &values, if (null_handling == null_policy::INCLUDE || !values.has_nulls()) { // Returns index of nth value. thrust::transform_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), group_sizes.begin(), group_sizes.end(), group_offsets.begin(), @@ -67,7 +69,7 @@ std::unique_ptr group_nth_element(column_view const &values, [] __device__(auto b) { return static_cast(b); }); rmm::device_vector intra_group_index(values.size()); // intra group index for valids only. 
- thrust::exclusive_scan_by_key(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), bitmask_iterator, @@ -76,7 +78,7 @@ std::unique_ptr group_nth_element(column_view const &values, rmm::device_vector group_count = [&] { if (n < 0) { rmm::device_vector group_count(num_groups); - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), bitmask_iterator, @@ -88,7 +90,7 @@ std::unique_ptr group_nth_element(column_view const &values, } }(); // gather the valid index == n - thrust::scatter_if(rmm::exec_policy(stream)->on(stream), + thrust::scatter_if(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(values.size()), group_labels.begin(), // map diff --git a/cpp/src/groupby/sort/group_nunique.cu b/cpp/src/groupby/sort/group_nunique.cu index 37455a31a91..a1daaedaf27 100644 --- a/cpp/src/groupby/sort/group_nunique.cu +++ b/cpp/src/groupby/sort/group_nunique.cu @@ -20,6 +20,8 @@ #include #include +#include + #include #include @@ -35,8 +37,8 @@ struct nunique_functor { size_type const num_groups, rmm::device_vector const& group_offsets, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto result = make_numeric_column( data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); @@ -61,7 +63,7 @@ struct nunique_functor { return static_cast(is_unique); }); - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), is_unique_iterator, @@ -79,7 +81,7 @@ struct nunique_functor { (not equal.operator()(i, i - 1)); // new unique value in sorted return 
static_cast(is_unique); }); - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), is_unique_iterator, @@ -96,8 +98,8 @@ struct nunique_functor { size_type const num_groups, rmm::device_vector const& group_offsets, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("list_view group_nunique not supported yet"); } @@ -108,8 +110,8 @@ std::unique_ptr group_nunique(column_view const& values, size_type const num_groups, rmm::device_vector const& group_offsets, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative"); CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), @@ -122,8 +124,8 @@ std::unique_ptr group_nunique(column_view const& values, num_groups, group_offsets, null_handling, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index 7afeb7c39e4..a9a46b25c04 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,14 +16,16 @@ #include "group_reductions.hpp" +#include + #include #include #include #include #include -#include #include +#include #include @@ -40,8 +42,8 @@ struct quantiles_functor { size_type const num_groups, rmm::device_vector const& quantile, interpolation interpolation, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using ResultType = cudf::detail::target_type_t; @@ -60,7 +62,7 @@ struct quantiles_functor { auto result_view = mutable_column_device_view::create(result->mutable_view()); // For each group, calculate quantile - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), num_groups, [d_values = *values_view, @@ -125,8 +127,8 @@ std::unique_ptr group_quantiles(column_view const& values, size_type const num_groups, std::vector const& quantiles, interpolation interp, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_vector dv_quantiles(quantiles); @@ -138,8 +140,8 @@ std::unique_ptr group_quantiles(column_view const& values, num_groups, dv_quantiles, interp, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index f1952bc41f7..718ff6e0db9 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -20,6 +20,7 @@ #include #include +#include #include @@ -38,8 +39,8 @@ namespace detail { std::unique_ptr group_sum(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate groupwise minimum value @@ -53,8 +54,8 
@@ std::unique_ptr group_sum(column_view const& values, std::unique_ptr group_min(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate groupwise maximum value @@ -68,8 +69,8 @@ std::unique_ptr group_min(column_view const& values, std::unique_ptr group_max(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate group-wise indices of maximum values. @@ -85,8 +86,8 @@ std::unique_ptr group_argmax(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, column_view const& key_sort_order, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate group-wise indices of minimum values. 
@@ -102,8 +103,8 @@ std::unique_ptr group_argmin(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, column_view const& key_sort_order, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate number of non-null values in each group of @@ -118,8 +119,8 @@ std::unique_ptr group_argmin(column_view const& values, std::unique_ptr group_count_valid(column_view const& values, rmm::device_vector const& group_labels, size_type num_groups, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate number of values in each group of @p values @@ -131,8 +132,8 @@ std::unique_ptr group_count_valid(column_view const& values, */ std::unique_ptr group_count_all(rmm::device_vector const& group_offsets, size_type num_groups, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate groupwise variance @@ -151,8 +152,8 @@ std::unique_ptr group_var(column_view const& values, column_view const& group_sizes, rmm::device_vector const& group_labels, size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate groupwise quantiles @@ -171,8 +172,8 @@ std::unique_ptr group_quantiles(column_view const& values, size_type const num_groups, std::vector const& quantiles, interpolation interp, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate number of unique values in each group of @@ -193,8 +194,8 @@ std::unique_ptr group_nunique(column_view const& 
values, size_type const num_groups, rmm::device_vector const& group_offsets, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate nth values in each group of @p values @@ -217,8 +218,8 @@ std::unique_ptr group_nth_element(column_view const& values, size_type num_groups, size_type n, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to collect grouped values into a lists column * @@ -231,8 +232,8 @@ std::unique_ptr group_nth_element(column_view const& values, std::unique_ptr group_collect(column_view const& values, rmm::device_vector const& group_offsets, size_type num_groups, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace groupby diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index cc21405925b..696acc886a2 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,6 +27,8 @@ #include #include +#include + #include namespace cudf { @@ -52,8 +54,8 @@ struct reduce_functor { column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using OpType = cudf::detail::corresponding_operator_t; using ResultType = cudf::detail::target_type_t; @@ -70,10 +72,10 @@ struct reduce_functor { auto result_table = mutable_table_view({*result}); cudf::detail::initialize_with_identity(result_table, {K}, stream); - auto resultview = mutable_column_device_view::create(result->mutable_view()); - auto valuesview = column_device_view::create(values); + auto resultview = mutable_column_device_view::create(result->mutable_view(), stream); + auto valuesview = column_device_view::create(values, stream); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), values.size(), [d_values = *valuesview, diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index eb504951ebb..143a66ab2bd 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -63,17 +64,17 @@ struct var_functor { column_view const& group_sizes, rmm::device_vector const& group_labels, size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Running this in debug build causes a runtime error: // `reduce_by_key failed on 2nd step: invalid device function` #if !defined(__CUDACC_DEBUG__) using ResultType = cudf::detail::target_type_t; size_type const* d_group_labels = group_labels.data().get(); - auto values_view = column_device_view::create(values); - auto means_view = column_device_view::create(group_means); - auto group_size_view = column_device_view::create(group_sizes); + auto values_view = column_device_view::create(values, stream); + auto means_view = column_device_view::create(group_means, stream); + auto group_size_view = column_device_view::create(group_sizes, stream); std::unique_ptr result = make_numeric_column(data_type(type_to_id()), group_sizes.size(), @@ -89,7 +90,7 @@ struct var_functor { thrust::make_counting_iterator(0), var_transform{d_values, d_means, d_group_sizes, d_group_labels, ddof}); - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), group_labels.begin(), group_labels.end(), values_it, @@ -97,10 +98,10 @@ struct var_functor { result->mutable_view().data()); // set nulls - auto result_view = mutable_column_device_view::create(*result); + auto result_view = mutable_column_device_view::create(*result, stream); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), group_sizes.size(), [d_result = *result_view, d_group_sizes = *group_size_view, ddof] __device__(size_type i) { @@ -132,11 +133,11 @@ std::unique_ptr group_var(column_view const& values, column_view const& group_sizes, 
rmm::device_vector const& group_labels, size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return type_dispatcher( - values.type(), var_functor{}, values, group_means, group_sizes, group_labels, ddof, mr, stream); + values.type(), var_functor{}, values, group_means, group_sizes, group_labels, ddof, stream, mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_sum.cu b/cpp/src/groupby/sort/group_sum.cu index 25b68ae86f4..bf3aff91c99 100644 --- a/cpp/src/groupby/sort/group_sum.cu +++ b/cpp/src/groupby/sort/group_sum.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,22 +16,24 @@ #include +#include + namespace cudf { namespace groupby { namespace detail { std::unique_ptr group_sum(column_view const& values, size_type num_groups, rmm::device_vector const& group_labels, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return type_dispatcher(values.type(), reduce_functor{}, values, num_groups, group_labels, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index 1b9fff02fba..84cc1af0d76 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -33,6 +33,8 @@ #include #include +#include + #include #include #include @@ -54,9 +56,9 @@ struct store_result_functor { column_view const& values, sort::sort_groupby_helper& helper, cudf::detail::result_cache& cache, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : col_idx(col_idx), values(values), helper(helper), cache(cache), stream(stream), mr(mr) + : col_idx(col_idx), 
helper(helper), cache(cache), values(values), stream(stream), mr(mr) { } @@ -105,7 +107,7 @@ struct store_result_functor { cudf::detail::result_cache& cache; ///< cache of results to store into column_view const& values; ///< Column of values to group and aggregate - cudaStream_t stream; ///< CUDA stream on which to execute kernels + rmm::cuda_stream_view stream; ///< CUDA stream on which to execute kernels rmm::mr::device_memory_resource* mr; ///< Memory resource to allocate space for results std::unique_ptr sorted_values; ///< Memoised grouped and sorted values @@ -122,8 +124,8 @@ void store_result_functor::operator()(aggregation cons agg, get_grouped_values().nullable() ? detail::group_count_valid( - get_grouped_values(), helper.group_labels(), helper.num_groups(), mr, stream) - : detail::group_count_all(helper.group_offsets(), helper.num_groups(), mr, stream)); + get_grouped_values(), helper.group_labels(), helper.num_groups(), stream, mr) + : detail::group_count_all(helper.group_offsets(), helper.num_groups(), stream, mr)); } template <> @@ -132,7 +134,7 @@ void store_result_functor::operator()(aggregation const& if (cache.has_result(col_idx, agg)) return; cache.add_result( - col_idx, agg, detail::group_count_all(helper.group_offsets(), helper.num_groups(), mr, stream)); + col_idx, agg, detail::group_count_all(helper.group_offsets(), helper.num_groups(), stream, mr)); } template <> @@ -143,7 +145,7 @@ void store_result_functor::operator()(aggregation const& agg) cache.add_result(col_idx, agg, detail::group_sum( - get_grouped_values(), helper.num_groups(), helper.group_labels(), mr, stream)); + get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr)); }; template <> @@ -157,8 +159,8 @@ void store_result_functor::operator()(aggregation const& ag helper.num_groups(), helper.group_labels(), helper.key_sort_order(), - mr, - stream)); + stream, + mr)); }; template <> @@ -172,8 +174,8 @@ void store_result_functor::operator()(aggregation const& 
ag helper.num_groups(), helper.group_labels(), helper.key_sort_order(), - mr, - stream)); + stream, + mr)); }; template <> @@ -184,7 +186,7 @@ void store_result_functor::operator()(aggregation const& agg) auto result = [&]() { if (cudf::is_fixed_width(values.type())) { return detail::group_min( - get_grouped_values(), helper.num_groups(), helper.group_labels(), mr, stream); + get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr); } else { auto argmin_agg = make_argmin_aggregation(); operator()(*argmin_agg); @@ -221,7 +223,7 @@ void store_result_functor::operator()(aggregation const& agg) auto result = [&]() { if (cudf::is_fixed_width(values.type())) { return detail::group_max( - get_grouped_values(), helper.num_groups(), helper.group_labels(), mr, stream); + get_grouped_values(), helper.num_groups(), helper.group_labels(), stream, mr); } else { auto argmax_agg = make_argmax_aggregation(); operator()(*argmax_agg); @@ -292,8 +294,8 @@ void store_result_functor::operator()(aggregation const& group_sizes, helper.group_labels(), var_agg._ddof, - mr, - stream); + stream, + mr); cache.add_result(col_idx, agg, std::move(result)); }; @@ -327,8 +329,8 @@ void store_result_functor::operator()(aggregation const& helper.num_groups(), quantile_agg._quantiles, quantile_agg._interpolation, - mr, - stream); + stream, + mr); cache.add_result(col_idx, agg, std::move(result)); }; @@ -347,8 +349,8 @@ void store_result_functor::operator()(aggregation const& ag helper.num_groups(), {0.5}, interpolation::LINEAR, - mr, - stream); + stream, + mr); cache.add_result(col_idx, agg, std::move(result)); }; @@ -364,8 +366,8 @@ void store_result_functor::operator()(aggregation const& a helper.num_groups(), helper.group_offsets(), nunique_agg._null_handling, - mr, - stream); + stream, + mr); cache.add_result(col_idx, agg, std::move(result)); }; @@ -394,8 +396,8 @@ void store_result_functor::operator()(aggregation cons helper.num_groups(), nth_element_agg._n, 
nth_element_agg._null_handling, - mr, - stream)); + stream, + mr)); } template <> @@ -404,7 +406,7 @@ void store_result_functor::operator()(aggregation const& a if (cache.has_result(col_idx, agg)) return; auto result = detail::group_collect( - get_grouped_values(), helper.group_offsets(), helper.num_groups(), mr, stream); + get_grouped_values(), helper.group_offsets(), helper.num_groups(), stream, mr); cache.add_result(col_idx, agg, std::move(result)); }; @@ -414,7 +416,7 @@ void store_result_functor::operator()(aggregation const& a // Sort-based groupby std::pair, std::vector> groupby::sort_aggregate( std::vector const& requests, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // We're going to start by creating a cache of results so that aggs that diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index 950d8c2931b..f9e4fdc411b 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,8 @@ #include #include +#include + #include #include @@ -144,6 +146,7 @@ class concurrent_unordered_map { * responsibility to synchronize or use the same stream to access the map. * * @param capacity The maximum number of pairs the map may hold + * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param unused_element The sentinel value to use for an empty value * @param unused_key The sentinel value to use for an empty key * @param hash_function The hash function to use for hashing keys @@ -151,15 +154,14 @@ class concurrent_unordered_map { * equal * @param allocator The allocator to use for allocation the hash table's * storage - * @param stream CUDA stream used for device memory operations and kernel launches. **/ static auto create(size_type capacity, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, const mapped_type unused_element = std::numeric_limits::max(), const key_type unused_key = std::numeric_limits::max(), const Hasher& hash_function = hasher(), const Equality& equal = key_equal(), - const allocator_type& allocator = allocator_type(), - cudaStream_t stream = 0) + const allocator_type& allocator = allocator_type()) { CUDF_FUNC_RANGE(); using Self = concurrent_unordered_map; @@ -416,7 +418,8 @@ class concurrent_unordered_map { } } - void assign_async(const concurrent_unordered_map& other, cudaStream_t stream = 0) + void assign_async(const concurrent_unordered_map& other, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { if (other.m_capacity <= m_capacity) { m_capacity = other.m_capacity; @@ -431,13 +434,13 @@ class concurrent_unordered_map { other.m_hashtbl_values, m_capacity * sizeof(value_type), cudaMemcpyDefault, - stream)); + stream.value())); } - void clear_async(cudaStream_t stream = 0) + void clear_async(rmm::cuda_stream_view stream = rmm::cuda_stream_default) { constexpr int block_size = 128; - init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream>>>( + init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( m_hashtbl_values, m_capacity, m_unused_key, m_unused_element); } @@ -449,16 +452,16 @@ class concurrent_unordered_map { } } - void prefetch(const int dev_id, cudaStream_t stream = 0) + void prefetch(const int dev_id, rmm::cuda_stream_view stream = 
rmm::cuda_stream_default) { cudaPointerAttributes hashtbl_values_ptr_attributes; cudaError_t status = cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values); if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) { - CUDA_TRY( - cudaMemPrefetchAsync(m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream)); + CUDA_TRY(cudaMemPrefetchAsync( + m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value())); } - CUDA_TRY(cudaMemPrefetchAsync(this, sizeof(*this), dev_id, stream)); + CUDA_TRY(cudaMemPrefetchAsync(this, sizeof(*this), dev_id, stream.value())); } /** @@ -469,7 +472,7 @@ class concurrent_unordered_map { * * @param stream CUDA stream used for device memory operations and kernel launches. **/ - void destroy(cudaStream_t stream = 0) + void destroy(rmm::cuda_stream_view stream = rmm::cuda_stream_default) { m_allocator.deallocate(m_hashtbl_values, m_capacity, stream); delete this; @@ -510,7 +513,7 @@ class concurrent_unordered_map { const Hasher& hash_function, const Equality& equal, const allocator_type& allocator, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) : m_hf(hash_function), m_equal(equal), m_allocator(allocator), @@ -528,12 +531,12 @@ class concurrent_unordered_map { if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) { int dev_id = 0; CUDA_TRY(cudaGetDevice(&dev_id)); - CUDA_TRY( - cudaMemPrefetchAsync(m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream)); + CUDA_TRY(cudaMemPrefetchAsync( + m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value())); } } - init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream>>>( + init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( m_hashtbl_values, m_capacity, m_unused_key, m_unused_element); CUDA_TRY(cudaGetLastError()); } diff --git a/cpp/src/hash/concurrent_unordered_multimap.cuh 
b/cpp/src/hash/concurrent_unordered_multimap.cuh index 1807065bc86..8ba36e8696d 100644 --- a/cpp/src/hash/concurrent_unordered_multimap.cuh +++ b/cpp/src/hash/concurrent_unordered_multimap.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,8 @@ #include #include +#include + #include #include @@ -91,20 +93,20 @@ class concurrent_unordered_multimap { * responsibility to synchronize or use the same stream to access the map. * * @param capacity The maximum number of pairs the map may hold. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param init Indicates if the map should be initialized with the unused * key/values * @param hash_function The hash function to use for hashing keys * @param equal The equality comparison function for comparing if two keys are * equal * @param allocator The allocator to use for allocation of the map's storage - * @param stream CUDA stream used for device memory operations and kernel launches. 
**/ static auto create(size_type capacity, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, const bool init = true, const Hasher& hash_function = hasher(), const Equality& equal = key_equal(), - const allocator_type& allocator = allocator_type(), - cudaStream_t stream = 0) + const allocator_type& allocator = allocator_type()) { CUDF_FUNC_RANGE(); using Self = concurrent_unordered_multimap>>( + init_hashtbl<<<((m_hashtbl_size - 1) / block_size) + 1, block_size, 0, stream.value()>>>( m_hashtbl_values, m_hashtbl_size, unused_key, unused_element); if (count_collisions) m_collisions = 0; } @@ -520,14 +523,14 @@ class concurrent_unordered_multimap { } } - void prefetch(const int dev_id, cudaStream_t stream = 0) + void prefetch(const int dev_id, rmm::cuda_stream_view stream = rmm::cuda_stream_default) { cudaPointerAttributes hashtbl_values_ptr_attributes; cudaError_t status = cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values); if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) { CUDA_TRY(cudaMemPrefetchAsync( - m_hashtbl_values, m_hashtbl_size * sizeof(value_type), dev_id, stream)); + m_hashtbl_values, m_hashtbl_size * sizeof(value_type), dev_id, stream.value())); } } @@ -561,11 +564,11 @@ class concurrent_unordered_multimap { * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ explicit concurrent_unordered_multimap(size_type n, - const bool init = true, - const Hasher& hash_function = hasher(), - const Equality& equal = key_equal(), - const allocator_type& a = allocator_type(), - cudaStream_t stream = 0) + const bool init = true, + const Hasher& hash_function = hasher(), + const Equality& equal = key_equal(), + const allocator_type& a = allocator_type(), + rmm::cuda_stream_view stream = rmm::cuda_stream_default) : m_hf(hash_function), m_equal(equal), m_allocator(a), @@ -584,12 +587,12 @@ class concurrent_unordered_multimap { int dev_id = 0; CUDA_TRY(cudaGetDevice(&dev_id)); CUDA_TRY(cudaMemPrefetchAsync( - m_hashtbl_values, m_hashtbl_size * sizeof(value_type), dev_id, stream)); + m_hashtbl_values, m_hashtbl_size * sizeof(value_type), dev_id, stream.value())); } } if (init) { - init_hashtbl<<<((m_hashtbl_size - 1) / block_size) + 1, block_size, 0, stream>>>( + init_hashtbl<<<((m_hashtbl_size - 1) / block_size) + 1, block_size, 0, stream.value()>>>( m_hashtbl_values, m_hashtbl_size, unused_key, unused_element); CUDA_TRY(cudaGetLastError()); } diff --git a/cpp/src/hash/hash_allocator.cuh b/cpp/src/hash/hash_allocator.cuh index 7a0f3fd4005..0c4acccf33d 100644 --- a/cpp/src/hash/hash_allocator.cuh +++ b/cpp/src/hash/hash_allocator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, NVIDIA CORPORATION. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -35,12 +36,14 @@ struct managed_allocator { { } - T* allocate(std::size_t n, cudaStream_t stream = 0) const + T* allocate(std::size_t n, rmm::cuda_stream_view stream = rmm::cuda_stream_default) const { return static_cast(mr->allocate(n * sizeof(T), stream)); } - void deallocate(T* p, std::size_t n, cudaStream_t stream = 0) const + void deallocate(T* p, + std::size_t n, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) const { mr->deallocate(p, n * sizeof(T), stream); } @@ -69,12 +72,14 @@ struct default_allocator { { } - T* allocate(std::size_t n, cudaStream_t stream = 0) const + T* allocate(std::size_t n, rmm::cuda_stream_view stream = rmm::cuda_stream_default) const { return static_cast(mr->allocate(n * sizeof(T), stream)); } - void deallocate(T* p, std::size_t n, cudaStream_t stream = 0) const + void deallocate(T* p, + std::size_t n, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) const { mr->deallocate(p, n * sizeof(T), stream); } diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 8e91de9707f..e6f6ba2bbad 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -675,14 +675,14 @@ std::unique_ptr md5_hash(table_view const& input, "MD5 unsupported column type"); // Result column allocation and creation - auto begin = thrust::make_constant_iterator(32); - auto offsets_column = cudf::strings::detail::make_offsets_child_column( - begin, begin + input.num_rows(), mr, stream.value()); + auto begin = thrust::make_constant_iterator(32); + auto offsets_column = + cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.data(); auto chars_column = strings::detail::create_chars_child_column( - input.num_rows(), 0, input.num_rows() * 32, mr, stream.value()); + input.num_rows(), 0, input.num_rows() * 32, stream, mr); auto chars_view = 
chars_column->mutable_view(); auto d_chars = chars_view.data(); diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh index 0e6173529f6..11d4bc414ca 100644 --- a/cpp/src/hash/unordered_multiset.cuh +++ b/cpp/src/hash/unordered_multiset.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,12 @@ #pragma once +#include + #include #include -#include + +#include namespace cudf { namespace detail { @@ -68,7 +71,7 @@ class unordered_multiset { /** * @brief Factory to construct a new unordered_multiset **/ - static unordered_multiset create(column_view const &col, cudaStream_t stream) + static unordered_multiset create(column_view const &col, rmm::cuda_stream_view stream) { auto d_column = column_device_view::create(col, stream); auto d_col = *d_column; @@ -82,7 +85,7 @@ class unordered_multiset { size_type *d_hash_bins_end = hash_bins_end.data().get(); Element *d_hash_data = hash_data.data().get(); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(col.size()), [d_hash_bins_start, d_col, hasher] __device__(size_t idx) { @@ -93,17 +96,17 @@ class unordered_multiset { } }); - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), hash_bins_start.begin(), hash_bins_start.end(), hash_bins_end.begin()); - thrust::copy(rmm::exec_policy(stream)->on(stream), + thrust::copy(rmm::exec_policy(stream)->on(stream.value()), hash_bins_end.begin(), hash_bins_end.end(), hash_bins_start.begin()); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + 
thrust::for_each(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(col.size()), [d_hash_bins_end, d_hash_data, d_col, hasher] __device__(size_t idx) { diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index efc19791c07..5f4fcb1c108 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 4f208d8985c..0bdb1f5eeb6 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -209,9 +209,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (array.length() == 0) { - return cudf::strings::detail::make_empty_strings_column(mr, stream.value()); - } + if (array.length() == 0) { return cudf::strings::detail::make_empty_strings_column(stream, mr); } auto str_array = static_cast(&array); auto offset_array = std::make_unique( str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr); @@ -294,7 +292,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( out_mask = detail::copy_bitmask(static_cast(out_mask.data()), array.offset(), array.offset() + array.length(), - rmm::cuda_stream_view{stream}, + stream, mr); } diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index f5c9c708821..93ec44e4cb2 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -17,6 +17,8 @@ #include +#include + using cudf::detail::device_span; namespace cudf { @@ -310,32 +312,32 @@ extern "C" __global__ void __launch_bounds__(NWARPS * 32, 2) * @param[in] first_row Crop all rows below first_row * @param[in] 
min_row_size Minimum size in bytes of a row * @param[in] stream CUDA stream to use, default 0 - */ -void __host__ DecodeAvroColumnData(block_desc_s *blocks, - schemadesc_s *schema, - device_span global_dictionary, - const uint8_t *avro_data, - uint32_t num_blocks, - uint32_t schema_len, - size_t max_rows, - size_t first_row, - uint32_t min_row_size, - cudaStream_t stream) + **/ +void DecodeAvroColumnData(block_desc_s *blocks, + schemadesc_s *schema, + device_span global_dictionary, + const uint8_t *avro_data, + uint32_t num_blocks, + uint32_t schema_len, + size_t max_rows, + size_t first_row, + uint32_t min_row_size, + rmm::cuda_stream_view stream) { // NWARPS warps per threadblock dim3 const dim_block(32, NWARPS); // 1 warp per datablock, NWARPS datablocks per threadblock dim3 const dim_grid((num_blocks + NWARPS - 1) / NWARPS, 1); - gpuDecodeAvroColumnData<<>>(blocks, - schema, - global_dictionary, - avro_data, - num_blocks, - schema_len, - min_row_size, - max_rows, - first_row); + gpuDecodeAvroColumnData<<>>(blocks, + schema, + global_dictionary, + avro_data, + num_blocks, + schema_len, + min_row_size, + max_rows, + first_row); } } // namespace gpu diff --git a/cpp/src/io/avro/avro_gpu.h b/cpp/src/io/avro/avro_gpu.h index 7f03482c54a..5aac6f99a80 100644 --- a/cpp/src/io/avro/avro_gpu.h +++ b/cpp/src/io/avro/avro_gpu.h @@ -19,6 +19,8 @@ #include +#include + namespace cudf { namespace io { namespace avro { @@ -61,10 +63,10 @@ void DecodeAvroColumnData(block_desc_s *blocks, const uint8_t *avro_data, uint32_t num_blocks, uint32_t schema_len, - size_t max_rows = ~0, - size_t first_row = 0, - uint32_t min_row_size = 0, - cudaStream_t stream = (cudaStream_t)0); + size_t max_rows = ~0, + size_t first_row = 0, + uint32_t min_row_size = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); } // namespace gpu } // namespace avro diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 95a1710d3ae..68c746f2956 100644 --- 
a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -139,7 +140,7 @@ class metadata : public file_metadata { }; rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_block_data, - cudaStream_t stream) + rmm::cuda_stream_view stream) { size_t uncompressed_data_size = 0; hostdevice_vector inflate_in(_metadata->block_list.size()); @@ -191,8 +192,9 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_ inflate_in.host_ptr(), inflate_in.memory_size(), cudaMemcpyHostToDevice, - stream)); - CUDA_TRY(cudaMemsetAsync(inflate_out.device_ptr(), 0, inflate_out.memory_size(), stream)); + stream.value())); + CUDA_TRY( + cudaMemsetAsync(inflate_out.device_ptr(), 0, inflate_out.memory_size(), stream.value())); if (_metadata->codec == "deflate") { CUDA_TRY(gpuinflate( inflate_in.device_ptr(), inflate_out.device_ptr(), inflate_in.size(), 0, stream)); @@ -206,8 +208,8 @@ rmm::device_buffer reader::impl::decompress_data(const rmm::device_buffer &comp_ inflate_out.device_ptr(), inflate_out.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); // Check if larger output is required, as it's not known ahead of time if (_metadata->codec == "deflate" && !loop_cnt) { @@ -247,7 +249,7 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, size_t num_rows, std::vector> selection, std::vector &out_buffers, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Build gpu schema hostdevice_vector schema_desc(_metadata->schema.size()); @@ -312,7 +314,7 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, schema_desc.host_ptr(), schema_desc.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::DecodeAvroColumnData(static_cast(block_list.data()), schema_desc.device_ptr(), @@ -332,15 +334,16 @@ void 
reader::impl::decode_data(const rmm::device_buffer &block_data, valid_alias[i], out_buffers[i].null_mask_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); } } CUDA_TRY(cudaMemcpyAsync(schema_desc.host_ptr(), schema_desc.device_ptr(), schema_desc.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); + for (size_t i = 0; i < out_buffers.size(); i++) { const auto col_idx = selection[i].first; const auto schema_null_idx = _metadata->columns[col_idx].schema_null_idx; @@ -351,13 +354,14 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, reader::impl::impl(std::unique_ptr source, avro_reader_options const &options, rmm::mr::device_memory_resource *mr) - : _source(std::move(source)), _mr(mr), _columns(options.get_columns()) + : _mr(mr), _source(std::move(source)), _columns(options.get_columns()) { // Open the source Avro dataset metadata _metadata = std::make_unique(_source.get()); } -table_with_metadata reader::impl::read(avro_reader_options const &options, cudaStream_t stream) +table_with_metadata reader::impl::read(avro_reader_options const &options, + rmm::cuda_stream_view stream) { auto skip_rows = options.get_skip_rows(); auto num_rows = options.get_num_rows(); @@ -430,17 +434,18 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, cudaS dict_pos += len; } } + CUDA_TRY(cudaMemcpyAsync(d_global_dict.data(), h_global_dict.data(), h_global_dict.size() * sizeof(gpu::nvstrdesc_s), cudaMemcpyDefault, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync(d_global_dict_data.data(), h_global_dict_data.data(), h_global_dict_data.size() * sizeof(char), cudaMemcpyDefault, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); } std::vector out_buffers; @@ -453,7 +458,7 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, cudaS decode_data(block_data, dict, 
d_global_dict, num_rows, selected_columns, out_buffers, stream); for (size_t i = 0; i < column_types.size(); ++i) { - out_columns.emplace_back(make_column(out_buffers[i], stream, _mr)); + out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, _mr)); } } else { // Create empty columns @@ -496,7 +501,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(avro_reader_options const &options, cudaStream_t stream) +table_with_metadata reader::read(avro_reader_options const &options, rmm::cuda_stream_view stream) { return _impl->read(options, stream); } diff --git a/cpp/src/io/avro/reader_impl.hpp b/cpp/src/io/avro/reader_impl.hpp index cdebb3cf9dc..880c428b60d 100644 --- a/cpp/src/io/avro/reader_impl.hpp +++ b/cpp/src/io/avro/reader_impl.hpp @@ -31,6 +31,8 @@ #include #include +#include + #include #include #include @@ -70,7 +72,7 @@ class reader::impl { * * @return The set of columns along with metadata */ - table_with_metadata read(avro_reader_options const &options, cudaStream_t stream); + table_with_metadata read(avro_reader_options const &options, rmm::cuda_stream_view stream); private: /** @@ -82,7 +84,7 @@ class reader::impl { * @return Device buffer to decompressed block data */ rmm::device_buffer decompress_data(const rmm::device_buffer &comp_block_data, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Convert the avro row-based block data and outputs to columns @@ -99,7 +101,7 @@ class reader::impl { size_t num_rows, std::vector> columns, std::vector &out_buffers, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: rmm::mr::device_memory_resource *_mr = nullptr; diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 5fcf73c03d1..57bad5f3283 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -54,11 +54,15 @@ THE SOFTWARE. 
*/ -#include -#include #include "brotli_dict.h" #include "gpuinflate.h" +#include + +#include + +#include + namespace cudf { namespace io { #define HUFFTAB_LUT1_BITS 8 @@ -2025,7 +2029,7 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s *inputs, void *scratch, size_t scratch_size, int count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { uint32_t count32 = (count > 0) ? count : 0; uint32_t fb_heap_size; @@ -2037,15 +2041,15 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s *inputs, scratch_size = min(scratch_size, (size_t)0xffffffffu); fb_heap_size = (uint32_t)((scratch_size - sizeof(brotli_dictionary_s)) & ~0xf); - CUDA_TRY(cudaMemsetAsync(scratch_u8, 0, 2 * sizeof(uint32_t), stream)); + CUDA_TRY(cudaMemsetAsync(scratch_u8, 0, 2 * sizeof(uint32_t), stream.value())); // NOTE: The 128KB dictionary copy can have a relatively large overhead since source isn't // page-locked CUDA_TRY(cudaMemcpyAsync(scratch_u8 + fb_heap_size, get_brotli_dictionary(), sizeof(brotli_dictionary_s), cudaMemcpyHostToDevice, - stream)); - gpu_debrotli_kernel<<>>( + stream.value())); + gpu_debrotli_kernel<<>>( inputs, outputs, scratch_u8, fb_heap_size, count32); #if DUMP_FB_HEAP uint32_t dump[2]; @@ -2053,8 +2057,8 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s *inputs, printf("heap dump (%d bytes)\n", fb_heap_size); while (cur < fb_heap_size && !(cur & 3)) { CUDA_TRY(cudaMemcpyAsync( - &dump[0], scratch_u8 + cur, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + &dump[0], scratch_u8 + cur, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); printf("@%d: next = %d, size = %d\n", cur, dump[0], dump[1]); cur = (dump[0] > cur) ? 
dump[0] : 0xffffffffu; } diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 723b0850a6c..840b868ffb5 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -43,9 +43,12 @@ misrepresented as being the original software. Mark Adler madler@alumni.caltech.edu */ -#include #include "gpuinflate.h" +#include + +#include + namespace cudf { namespace io { #define NUMTHREADS 128 // Threads per block @@ -1199,17 +1202,19 @@ cudaError_t __host__ gpuinflate(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, int count, int parse_hdr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - if (count > 0) { inflate_kernel<<>>(inputs, outputs, parse_hdr); } + if (count > 0) { + inflate_kernel<<>>(inputs, outputs, parse_hdr); + } return cudaSuccess; } cudaError_t __host__ gpu_copy_uncompressed_blocks(gpu_inflate_input_s *inputs, int count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - if (count > 0) { copy_uncompressed_kernel<<>>(inputs); } + if (count > 0) { copy_uncompressed_kernel<<>>(inputs); } return cudaSuccess; } diff --git a/cpp/src/io/comp/gpuinflate.h b/cpp/src/io/comp/gpuinflate.h index 461256d3762..692752c4e33 100644 --- a/cpp/src/io/comp/gpuinflate.h +++ b/cpp/src/io/comp/gpuinflate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,8 @@ #include +#include + namespace cudf { namespace io { /** @@ -53,9 +55,9 @@ struct gpu_inflate_status_s { **/ cudaError_t gpuinflate(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, - int count = 1, - int parse_hdr = 0, - cudaStream_t stream = (cudaStream_t)0); + int count = 1, + int parse_hdr = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Interface for copying uncompressed byte blocks @@ -65,8 +67,8 @@ cudaError_t gpuinflate(gpu_inflate_input_s *inputs, * @param[in] stream CUDA stream to use, default 0 **/ cudaError_t gpu_copy_uncompressed_blocks(gpu_inflate_input_s *inputs, - int count = 1, - cudaStream_t stream = (cudaStream_t)0); + int count = 1, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Interface for decompressing Snappy-compressed data @@ -81,8 +83,8 @@ cudaError_t gpu_copy_uncompressed_blocks(gpu_inflate_input_s *inputs, **/ cudaError_t gpu_unsnap(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, - int count = 1, - cudaStream_t stream = (cudaStream_t)0); + int count = 1, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Computes the size of temporary memory for Brotli decompression @@ -110,8 +112,8 @@ cudaError_t gpu_debrotli(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, void *scratch, size_t scratch_size, - int count = 1, - cudaStream_t stream = (cudaStream_t)0); + int count = 1, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Interface for compressing data with Snappy @@ -126,8 +128,8 @@ cudaError_t gpu_debrotli(gpu_inflate_input_s *inputs, **/ cudaError_t gpu_snap(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, - int count = 1, - cudaStream_t stream = (cudaStream_t)0); + int count = 1, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); } // namespace io } // namespace cudf diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 01214b00933..a3ab6a49a88 100644 --- 
a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,12 @@ * limitations under the License. */ -#include #include "gpuinflate.h" +#include + +#include + namespace cudf { namespace io { #define HASH_BITS 12 @@ -342,11 +345,13 @@ extern "C" __global__ void __launch_bounds__(128) cudaError_t __host__ gpu_snap(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, int count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block dim3 dim_grid(count, 1); - if (count > 0) { snap_kernel<<>>(inputs, outputs, count); } + if (count > 0) { + snap_kernel<<>>(inputs, outputs, count); + } return cudaSuccess; } diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 0d9407d7f65..e0824a8a0fb 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,15 +14,18 @@ * limitations under the License. */ -#include -#include // memset -#include // uncompress #include "io_uncomp.h" #include "unbz2.h" // bz2 uncompress #include #include +#include + +#include // memset + +#include // uncompress + using cudf::detail::host_span; namespace cudf { diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 44229e1202e..0eeb4602463 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -14,10 +14,14 @@ * limitations under the License. 
*/ -#include -#include #include "gpuinflate.h" +#include + +#include + +#include + namespace cudf { namespace io { // Not supporting streams longer than this (not what snappy is intended for) @@ -695,13 +699,13 @@ __global__ void __launch_bounds__(block_size) cudaError_t __host__ gpu_unsnap(gpu_inflate_input_s *inputs, gpu_inflate_status_s *outputs, int count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { uint32_t count32 = (count > 0) ? count : 0; dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block dim3 dim_grid(count32, 1); // TODO: Check max grid dimensions vs max expected count - unsnap_kernel<128><<>>(inputs, outputs); + unsnap_kernel<128><<>>(inputs, outputs); return cudaSuccess; } diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index dec35cb7feb..8b913e5918c 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -32,6 +32,8 @@ #include #include +#include + #include #include @@ -1009,13 +1011,13 @@ __global__ void __launch_bounds__(rowofs_block_dim) size_t __host__ count_blank_rows(const cudf::io::parse_options_view &opts, device_span const data, device_span const row_offsets, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const auto newline = opts.skipblanklines ? opts.terminator : opts.comment; const auto comment = opts.comment != '\0' ? opts.comment : newline; const auto carriage = (opts.skipblanklines && opts.terminator == '\n') ? 
'\r' : comment; return thrust::count_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), row_offsets.begin(), row_offsets.end(), [data = data, newline, comment, carriage] __device__(const uint64_t pos) { @@ -1027,14 +1029,14 @@ size_t __host__ count_blank_rows(const cudf::io::parse_options_view &opts, void __host__ remove_blank_rows(cudf::io::parse_options_view const &options, device_span const data, rmm::device_vector &row_offsets, - cudaStream_t stream) + rmm::cuda_stream_view stream) { size_t d_size = data.size(); const auto newline = options.skipblanklines ? options.terminator : options.comment; const auto comment = options.comment != '\0' ? options.comment : newline; const auto carriage = (options.skipblanklines && options.terminator == '\n') ? '\r' : comment; auto new_end = thrust::remove_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), row_offsets.begin(), row_offsets.end(), [data = data, d_size, newline, comment, carriage] __device__(const uint64_t pos) { @@ -1050,7 +1052,7 @@ thrust::host_vector detect_column_types( device_span const column_flags, device_span const row_starts, size_t const num_active_columns, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Calculate actual block count to use based on records count const int block_size = csvparse_block_dim; @@ -1058,7 +1060,7 @@ thrust::host_vector detect_column_types( auto d_stats = rmm::device_vector(num_active_columns); - data_type_detection<<>>( + data_type_detection<<>>( options, data, column_flags, row_starts, d_stats); return thrust::host_vector(d_stats); @@ -1071,14 +1073,14 @@ void __host__ decode_row_column_data(cudf::io::parse_options_view const &options device_span const dtypes, device_span const columns, device_span const valids, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Calculate actual block count to use based on records count auto const block_size = csvparse_block_dim; auto const 
num_rows = row_offsets.size() - 1; auto const grid_size = (num_rows + block_size - 1) / block_size; - convert_csv_to_cudf<<>>( + convert_csv_to_cudf<<>>( options, data, column_flags, row_offsets, dtypes, columns, valids); } @@ -1093,11 +1095,11 @@ uint32_t __host__ gather_row_offsets(const parse_options_view &options, size_t byte_range_start, size_t byte_range_end, size_t skip_rows, - cudaStream_t stream) + rmm::cuda_stream_view stream) { uint32_t dim_grid = 1 + (chunk_size / rowofs_block_bytes); - gather_row_offsets_gpu<<>>( + gather_row_offsets_gpu<<>>( row_ctx, offsets_out, data, diff --git a/cpp/src/io/csv/csv_gpu.h b/cpp/src/io/csv/csv_gpu.h index 921b17a8520..91982d60896 100644 --- a/cpp/src/io/csv/csv_gpu.h +++ b/cpp/src/io/csv/csv_gpu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include using cudf::detail::device_span; @@ -162,7 +163,7 @@ uint32_t gather_row_offsets(cudf::io::parse_options_view const &options, size_t byte_range_start, size_t byte_range_end, size_t skip_rows, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * Count the number of blank rows in the given row offset array @@ -176,7 +177,7 @@ uint32_t gather_row_offsets(cudf::io::parse_options_view const &options, size_t count_blank_rows(cudf::io::parse_options_view const &options, device_span data, device_span row_offsets, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * Remove blank rows in the given row offset array @@ -190,7 +191,7 @@ size_t count_blank_rows(cudf::io::parse_options_view const &options, void remove_blank_rows(const cudf::io::parse_options_view &options, device_span data, rmm::device_vector &row_offsets, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * @brief Launches kernel for detecting possible dtype of each column of data @@ -209,7 +210,7 @@ thrust::host_vector detect_column_types( device_span column_flags, device_span row_offsets, size_t const num_active_columns, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * @brief Launches kernel for decoding row-column data @@ -230,7 +231,7 @@ void decode_row_column_data(cudf::io::parse_options_view const &options, device_span dtypes, device_span columns, device_span valids, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); } // namespace gpu } // namespace csv diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu index 15dfb5f5534..3cfb4d88ec6 100644 --- a/cpp/src/io/csv/durations.cu +++ b/cpp/src/io/csv/durations.cu @@ -21,6 +21,8 @@ #include #include +#include + #include #include @@ -168,21 +170,20 @@ struct duration_to_string_fn : public duration_to_string_size_fn { struct dispatch_from_durations_fn { template ()>* = nullptr> std::unique_ptr operator()(column_view const& 
durations, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { size_type strings_count = durations.size(); auto column = column_device_view::create(durations, stream); auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(durations, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(durations, stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), duration_to_string_size_fn{d_column}); auto offsets_column = strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.template data(); @@ -190,11 +191,11 @@ struct dispatch_from_durations_fn { auto const chars_bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = strings::detail::create_chars_child_column( - strings_count, durations.null_count(), chars_bytes, mr, stream); + strings_count, durations.null_count(), chars_bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, duration_to_string_fn{d_column, d_new_offsets, d_chars}); @@ -212,8 +213,8 @@ struct dispatch_from_durations_fn { // non-duration types throw an exception template ()>* = nullptr> std::unique_ptr operator()(column_view const&, - rmm::mr::device_memory_resource*, - cudaStream_t) const + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const { CUDF_FAIL("Values for from_durations 
function must be a duration type."); } @@ -222,13 +223,13 @@ struct dispatch_from_durations_fn { } // namespace std::unique_ptr pandas_format_durations(column_view const& durations, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = durations.size(); if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); - return type_dispatcher(durations.type(), dispatch_from_durations_fn{}, durations, mr, stream); + return type_dispatcher(durations.type(), dispatch_from_durations_fn{}, durations, stream, mr); } } // namespace csv diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 602c3e7f82d..9093a4030e8 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -31,6 +31,8 @@ #include #include +#include + #include #include #include @@ -178,7 +180,7 @@ std::vector setColumnNames(std::vector const &header, return col_names; } -table_with_metadata reader::impl::read(cudaStream_t stream) +table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) { auto range_offset = opts_.get_byte_range_offset(); auto range_size = opts_.get_byte_range_size(); @@ -353,7 +355,7 @@ table_with_metadata reader::impl::read(cudaStream_t stream) out_columns.emplace_back( cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr_)); } else { - out_columns.emplace_back(make_column(out_buffers[i], stream, mr_)); + out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, mr_)); } } } else { @@ -386,7 +388,7 @@ void reader::impl::gather_row_offsets(host_span const data, size_t skip_rows, int64_t num_rows, bool load_whole_file, - cudaStream_t stream) + rmm::cuda_stream_view stream) { constexpr size_t max_chunk_bytes = 64 * 1024 * 1024; // 64MB size_t buffer_size = std::min(max_chunk_bytes, data.size()); @@ -428,8 +430,9 @@ void reader::impl::gather_row_offsets(host_span const data, row_ctx.device_ptr(), num_blocks * sizeof(uint64_t), 
cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); + // Sum up the rows in each character block, selecting the row count that // corresponds to the current input context. Also stores the now known input // context per character block that will be needed by the second pass. @@ -447,7 +450,7 @@ void reader::impl::gather_row_offsets(host_span const data, row_ctx.host_ptr(), num_blocks * sizeof(uint64_t), cudaMemcpyHostToDevice, - stream)); + stream.value())); // Pass 2: Output row offsets cudf::io::csv::gpu::gather_row_offsets(opts.view(), @@ -468,8 +471,9 @@ void reader::impl::gather_row_offsets(host_span const data, row_ctx.device_ptr(), num_blocks * sizeof(uint64_t), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); + size_t rows_out_of_range = 0; for (uint32_t i = 0; i < num_blocks; i++) { rows_out_of_range += row_ctx[i]; } if (rows_out_of_range != 0) { @@ -514,8 +518,9 @@ void reader::impl::gather_row_offsets(host_span const data, row_offsets_.data().get() + header_row_index, 2 * sizeof(uint64_t), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); + const auto header_start = buffer_pos + row_ctx[0]; const auto header_end = buffer_pos + row_ctx[1]; CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(), @@ -529,7 +534,7 @@ void reader::impl::gather_row_offsets(host_span const data, if (num_rows >= 0) { row_offsets_.resize(std::min(row_offsets_.size(), num_rows + 1)); } } -std::vector reader::impl::gather_column_types(cudaStream_t stream) +std::vector reader::impl::gather_column_types(rmm::cuda_stream_view stream) { std::vector dtypes; @@ -542,7 +547,7 @@ std::vector reader::impl::gather_column_types(cudaStream_t stream) auto column_stats = cudf::io::csv::gpu::detect_column_types( opts.view(), data_, d_column_flags_, row_offsets_, 
num_active_cols_, stream); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); for (int col = 0; col < num_active_cols_; col++) { unsigned long long int_count_total = column_stats[col].big_int_count + @@ -648,7 +653,7 @@ std::vector reader::impl::gather_column_types(cudaStream_t stream) } std::vector reader::impl::decode_data(std::vector const &column_types, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Alloc output; columns' data memory is still expected for empty dataframe std::vector out_buffers; @@ -687,7 +692,7 @@ std::vector reader::impl::decode_data(std::vector cons cudf::io::csv::gpu::decode_row_column_data( opts.view(), data_, d_column_flags_, row_offsets_, d_dtypes, d_data, d_valid, stream); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); for (int i = 0; i < num_active_cols_; ++i) { out_buffers[i].null_count() = UNKNOWN_NULL_COUNT; } @@ -790,7 +795,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(cudaStream_t stream) { return _impl->read(stream); } +table_with_metadata reader::read(rmm::cuda_stream_view stream) { return _impl->read(stream); } } // namespace csv } // namespace detail diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 29cc8bab6fe..67246165be0 100644 --- a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -28,6 +28,8 @@ #include #include +#include + #include #include #include @@ -86,7 +88,7 @@ class reader::impl { * * @return The set of columns along with metadata */ - table_with_metadata read(cudaStream_t stream); + table_with_metadata read(rmm::cuda_stream_view stream); private: /** @@ -110,7 +112,7 @@ class reader::impl { size_t skip_rows, int64_t num_rows, bool load_whole_file, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Find the start position of the first data row @@ -128,7 +130,7 @@ class reader::impl { * * @return 
`std::vector` List of column types */ - std::vector gather_column_types(cudaStream_t stream); + std::vector gather_column_types(rmm::cuda_stream_view stream); /** * @brief Converts the row-column data and outputs to column bufferrs. @@ -139,7 +141,7 @@ class reader::impl { * @return list of column buffers of decoded data, or ptr/size in the case of strings. */ std::vector decode_data(std::vector const &column_types, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: rmm::mr::device_memory_resource *mr_ = nullptr; diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index d833acb46ef..e3ad19e0445 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -21,27 +21,23 @@ #include "writer_impl.hpp" +#include + #include #include - -#include - +#include +#include #include #include #include #include - -#include +#include #include +#include -#include - -#include -#include -#include -#include -#include -#include +#include +#include +#include #include #include @@ -50,11 +46,12 @@ #include #include -#include -#include - -#include -#include +#include +#include +#include +#include +#include +#include namespace cudf { namespace io { @@ -217,7 +214,7 @@ struct column_to_strings_fn { explicit column_to_strings_fn(csv_writer_options const& options, rmm::mr::device_memory_resource* mr = nullptr, - cudaStream_t stream = nullptr) + rmm::cuda_stream_view stream = nullptr) : options_(options), mr_(mr), stream_(stream) { } @@ -268,7 +265,7 @@ struct column_to_strings_fn { string_scalar delimiter{std::string{options_.get_inter_column_delimiter()}, true, stream_}; predicate_special_chars pred{delimiter.value(stream_)}; - return modify_strings(column_v, mr_, stream_, pred); + return modify_strings(column_v, stream_, mr_, pred); } // ints: @@ -354,7 +351,7 @@ struct column_to_strings_fn { private: csv_writer_options const& options_; rmm::mr::device_memory_resource* mr_; - cudaStream_t stream_; + rmm::cuda_stream_view stream_; }; } 
// unnamed namespace @@ -380,7 +377,7 @@ writer::impl::impl(std::unique_ptr sink, // void writer::impl::write_chunked_begin(table_view const& table, const table_metadata* metadata, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if ((metadata != nullptr) && (options_.is_enabled_include_header())) { CUDF_EXPECTS(metadata->column_names.size() == static_cast(table.num_columns()), @@ -402,7 +399,7 @@ void writer::impl::write_chunked_begin(table_view const& table, void writer::impl::write_chunked(strings_column_view const& str_column_view, const table_metadata* metadata, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // algorithm outline: // @@ -442,9 +439,9 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, ptr_all_bytes, total_num_bytes * sizeof(char), cudaMemcpyDeviceToHost, - stream)); + stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); // host algorithm call, where the underlying call // is also host_write taking a host buffer; @@ -459,7 +456,7 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, void writer::impl::write(table_view const& table, const table_metadata* metadata, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_EXPECTS(table.num_columns() > 0, "Empty table."); @@ -495,15 +492,16 @@ void writer::impl::write(table_view const& table, splits.resize(n_chunks); rmm::device_vector d_splits(n_chunks, n_rows_per_chunk); - thrust::inclusive_scan(exec->on(stream), d_splits.begin(), d_splits.end(), d_splits.begin()); + thrust::inclusive_scan( + exec->on(stream.value()), d_splits.begin(), d_splits.end(), d_splits.begin()); CUDA_TRY(cudaMemcpyAsync(splits.data(), d_splits.data().get(), n_chunks * sizeof(size_type), cudaMemcpyDeviceToHost, - stream)); + stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); // split table_view into chunks: // @@ -548,7 +546,9 @@ void writer::impl::write(table_view const& table, 
write_chunked_end(table, metadata, stream); } -void writer::write(table_view const& table, const table_metadata* metadata, cudaStream_t stream) +void writer::write(table_view const& table, + const table_metadata* metadata, + rmm::cuda_stream_view stream) { _impl->write(table, metadata, stream); } diff --git a/cpp/src/io/csv/writer_impl.hpp b/cpp/src/io/csv/writer_impl.hpp index 24ede6a8fe8..f3d2f999070 100644 --- a/cpp/src/io/csv/writer_impl.hpp +++ b/cpp/src/io/csv/writer_impl.hpp @@ -28,6 +28,8 @@ #include #include +#include + #include #include #include @@ -65,7 +67,7 @@ class writer::impl { **/ void write(table_view const& table, const table_metadata* metadata = nullptr, - cudaStream_t stream = nullptr); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Write the header of a CSV format. @@ -76,7 +78,7 @@ class writer::impl { **/ void write_chunked_begin(table_view const& table, const table_metadata* metadata = nullptr, - cudaStream_t stream = nullptr); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Write dataset to CSV format without header. @@ -87,7 +89,7 @@ class writer::impl { **/ void write_chunked(strings_column_view const& strings_column, const table_metadata* metadata = nullptr, - cudaStream_t stream = nullptr); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Write footer of CSV format (typically, empty). 
@@ -98,7 +100,7 @@ class writer::impl { **/ void write_chunked_end(table_view const& table, const table_metadata* metadata = nullptr, - cudaStream_t stream = nullptr) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { // purposely no-op (for now); } @@ -111,7 +113,7 @@ class writer::impl { std::unique_ptr pandas_format_durations( column_view const& durations, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace csv diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 5f91b2cb2ce..a8e65216e5d 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -795,7 +796,7 @@ void convert_json_to_columns(parse_options_view const &opts, device_span const output_columns, device_span const valid_fields, device_span num_valid_fields, - cudaStream_t stream) + rmm::cuda_stream_view stream) { int block_size; int min_grid_size; @@ -804,7 +805,7 @@ void convert_json_to_columns(parse_options_view const &opts, const int grid_size = (row_offsets.size() + block_size - 1) / block_size; - convert_data_to_columns_kernel<<>>( + convert_data_to_columns_kernel<<>>( opts, data, row_offsets, column_types, col_map, output_columns, valid_fields, num_valid_fields); CUDA_TRY(cudaGetLastError()); @@ -821,7 +822,7 @@ std::vector detect_data_types( bool do_set_null_count, int num_columns, col_map_type *col_map, - cudaStream_t stream) + rmm::cuda_stream_view stream) { int block_size; int min_grid_size; @@ -834,7 +835,7 @@ std::vector detect_data_types( if (do_set_null_count) { // Set the null count to the row count (all fields assumes to be null). 
thrust::for_each( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), d_column_infos.begin(), d_column_infos.end(), [num_records = row_offsets.size()] __device__(auto &info) { info.null_count = num_records; }); @@ -843,7 +844,7 @@ std::vector detect_data_types( // Calculate actual block count to use based on records count const int grid_size = (row_offsets.size() + block_size - 1) / block_size; - detect_data_types_kernel<<>>( + detect_data_types_kernel<<>>( options, data, row_offsets, col_map, num_columns, d_column_infos); CUDA_TRY(cudaGetLastError()); @@ -863,7 +864,7 @@ void collect_keys_info(parse_options_view const &options, device_span const row_offsets, unsigned long long int *keys_cnt, thrust::optional keys_info, - cudaStream_t stream) + rmm::cuda_stream_view stream) { int block_size; int min_grid_size; @@ -873,7 +874,7 @@ void collect_keys_info(parse_options_view const &options, // Calculate actual block count to use based on records count const int grid_size = (row_offsets.size() + block_size - 1) / block_size; - collect_keys_info_kernel<<>>( + collect_keys_info_kernel<<>>( options, data, row_offsets, keys_cnt, keys_info); CUDA_TRY(cudaGetLastError()); diff --git a/cpp/src/io/json/json_gpu.h b/cpp/src/io/json/json_gpu.h index de7dd21b7f3..cbab408d2f1 100644 --- a/cpp/src/io/json/json_gpu.h +++ b/cpp/src/io/json/json_gpu.h @@ -25,6 +25,8 @@ #include #include +#include + #include using cudf::detail::device_span; @@ -57,7 +59,7 @@ void convert_json_to_columns(parse_options_view const &options, device_span output_columns, device_span valid_fields, device_span num_valid_fields, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * @brief Process a buffer of data and determine information about the column types within. 
@@ -79,7 +81,7 @@ std::vector detect_data_types( bool do_set_null_count, int num_columns, col_map_type *col_map, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * @brief Collects information about JSON object keys in the file. @@ -96,7 +98,7 @@ void collect_keys_info(parse_options_view const &options, device_span row_offsets, unsigned long long int *keys_cnt, thrust::optional keys_info, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); } // namespace gpu } // namespace json diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 3246f7e9ed0..121f0825228 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -35,6 +35,7 @@ #include #include +#include #include #include @@ -108,11 +109,12 @@ std::unique_ptr
aggregate_keys_info(std::unique_ptr
info) /** * @brief Initializes the (key hash -> column index) hash map. */ -col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, cudaStream_t stream) +col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, + rmm::cuda_stream_view stream) { - auto key_col_map{col_map_type::create(column_name_hashes.size())}; + auto key_col_map{col_map_type::create(column_name_hashes.size(), stream)}; auto const column_data = column_name_hashes.data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), column_name_hashes.size(), [map = *key_col_map, column_data] __device__(size_type idx) mutable { @@ -136,7 +138,7 @@ col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, cudaS std::unique_ptr
create_json_keys_info_table(const parse_options_view &options, device_span const data, device_span const row_offsets, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Count keys rmm::device_scalar key_counter(0, stream); @@ -166,7 +168,7 @@ std::unique_ptr
create_json_keys_info_table(const parse_options_view &opt */ std::vector create_key_strings(char const *h_data, table_view sorted_info, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto const num_cols = sorted_info.num_rows(); std::vector h_offsets(num_cols); @@ -174,14 +176,14 @@ std::vector create_key_strings(char const *h_data, sorted_info.column(0).data(), sizeof(uint64_t) * num_cols, cudaMemcpyDefault, - stream); + stream.value()); std::vector h_lens(num_cols); cudaMemcpyAsync(h_lens.data(), sorted_info.column(1).data(), sizeof(uint16_t) * num_cols, cudaMemcpyDefault, - stream); + stream.value()); std::vector names(num_cols); std::transform(h_offsets.cbegin(), @@ -206,7 +208,7 @@ auto sort_keys_info_by_offset(std::unique_ptr
info) * @return Names of JSON object keys in the file */ std::pair, col_map_ptr_type> reader::impl::get_json_object_keys_hashes( - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto info = create_json_keys_info_table( opts_.view(), @@ -259,7 +261,7 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) * Sets the uncomp_data_ and uncomp_size_ data members * Loads the data into device memory if byte range parameters are not used */ -void reader::impl::decompress_input(cudaStream_t stream) +void reader::impl::decompress_input(rmm::cuda_stream_view stream) { const auto compression_type = infer_compression_type(options_.get_compression(), @@ -289,7 +291,7 @@ void reader::impl::decompress_input(cudaStream_t stream) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -void reader::impl::set_record_starts(cudaStream_t stream) +void reader::impl::set_record_starts(rmm::cuda_stream_view stream) { std::vector chars_to_count{'\n'}; // Currently, ignoring lineterminations within quotes is handled by recording the records of both, @@ -310,7 +312,7 @@ void reader::impl::set_record_starts(cudaStream_t stream) // Manually adding an extra row to account for the first row in the file if (byte_range_offset_ == 0) { find_result_ptr++; - CUDA_TRY(cudaMemsetAsync(rec_starts_.data().get(), 0ull, sizeof(uint64_t), stream)); + CUDA_TRY(cudaMemsetAsync(rec_starts_.data().get(), 0ull, sizeof(uint64_t), stream.value())); } std::vector chars_to_find{'\n'}; @@ -325,7 +327,7 @@ void reader::impl::set_record_starts(cudaStream_t stream) // Previous call stores the record pinput_file.typeositions as encountered by all threads // Sort the record positions as subsequent processing may require filtering // certain rows or other processing on specific records - thrust::sort(rmm::exec_policy()->on(stream), rec_starts_.begin(), rec_starts_.end()); + thrust::sort(rmm::exec_policy()->on(stream.value()), rec_starts_.begin(), 
rec_starts_.end()); auto filtered_count = prefilter_count; if (allow_newlines_in_strings_) { @@ -343,7 +345,7 @@ void reader::impl::set_record_starts(cudaStream_t stream) } rec_starts_ = h_rec_starts; - thrust::sort(rmm::exec_policy()->on(stream), rec_starts_.begin(), rec_starts_.end()); + thrust::sort(rmm::exec_policy()->on(stream.value()), rec_starts_.begin(), rec_starts_.end()); } // Exclude the ending newline as it does not precede a record start @@ -360,7 +362,7 @@ void reader::impl::set_record_starts(cudaStream_t stream) * Also updates the array of record starts to match the device data offset. * */ -void reader::impl::upload_data_to_device(cudaStream_t stream) +void reader::impl::upload_data_to_device(rmm::cuda_stream_view stream) { size_t start_offset = 0; size_t end_offset = uncomp_size_; @@ -382,7 +384,7 @@ void reader::impl::upload_data_to_device(cudaStream_t stream) // Adjust row start positions to account for the data subcopy start_offset = h_rec_starts.front(); rec_starts_.resize(h_rec_starts.size()); - thrust::transform(rmm::exec_policy()->on(stream), + thrust::transform(rmm::exec_policy()->on(stream.value()), rec_starts_.begin(), rec_starts_.end(), thrust::make_constant_iterator(start_offset), @@ -405,7 +407,7 @@ void reader::impl::upload_data_to_device(cudaStream_t stream) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -void reader::impl::set_column_names(cudaStream_t stream) +void reader::impl::set_column_names(rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size uint64_t first_row_len = data_.size() / sizeof(char); @@ -415,12 +417,15 @@ void reader::impl::set_column_names(cudaStream_t stream) rec_starts_.data().get() + 1, sizeof(uint64_t), cudaMemcpyDeviceToHost, - stream)); + stream.value())); } std::vector first_row(first_row_len); - CUDA_TRY(cudaMemcpyAsync( - first_row.data(), data_.data(), first_row_len * sizeof(char), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(first_row.data(), + data_.data(), + first_row_len * sizeof(char), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); // Determine the row format between: // JSON array - [val1, val2, ...] and @@ -459,7 +464,7 @@ void reader::impl::set_column_names(cudaStream_t stream) * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -void reader::impl::set_data_types(cudaStream_t stream) +void reader::impl::set_data_types(rmm::cuda_stream_view stream) { auto const dtype = options_.get_dtypes(); if (!dtype.empty()) { @@ -555,7 +560,7 @@ void reader::impl::set_data_types(cudaStream_t stream) * * @return table_with_metadata struct */ -table_with_metadata reader::impl::convert_data_to_table(cudaStream_t stream) +table_with_metadata reader::impl::convert_data_to_table(rmm::cuda_stream_view stream) { const auto num_columns = dtypes_.size(); const auto num_records = rec_starts_.size(); @@ -592,8 +597,7 @@ table_with_metadata reader::impl::convert_data_to_table(cudaStream_t stream) d_valid_counts, stream); - CUDA_TRY(cudaStreamSynchronize(stream)); - CUDA_TRY(cudaGetLastError()); + stream.synchronize(); // postprocess columns auto target = make_strings_column( @@ -605,7 +609,7 @@ table_with_metadata reader::impl::convert_data_to_table(cudaStream_t stream) for (size_t i = 0; i < num_columns; ++i) { out_buffers[i].null_count() = num_records - h_valid_counts[i]; - auto out_column = make_column(out_buffers[i], stream, mr_); + auto out_column = make_column(out_buffers[i], nullptr, stream, mr_); if (out_column->type().id() == type_id::STRING) { // Need to remove escape character in case of '\"' and '\\' out_columns.emplace_back(cudf::strings::detail::replace( @@ -624,7 +628,7 @@ reader::impl::impl(std::unique_ptr source, std::string filepath, json_reader_options const &options, rmm::mr::device_memory_resource *mr) - : source_(std::move(source)), filepath_(filepath), options_(options), mr_(mr) + : options_(options), mr_(mr), source_(std::move(source)), filepath_(filepath) { CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); @@ -644,7 +648,8 @@ reader::impl::impl(std::unique_ptr source, * * @return Table and its metadata */ -table_with_metadata reader::impl::read(json_reader_options const &options, cudaStream_t stream) +table_with_metadata 
reader::impl::read(json_reader_options const &options, + rmm::cuda_stream_view stream) { auto range_offset = options.get_byte_range_offset(); auto range_size = options.get_byte_range_size(); @@ -695,7 +700,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(json_reader_options const &options, cudaStream_t stream) +table_with_metadata reader::read(json_reader_options const &options, rmm::cuda_stream_view stream) { return table_with_metadata{_impl->read(options, stream)}; } diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp index 7bea0b2cf85..ffd3dc58fe7 100644 --- a/cpp/src/io/json/reader_impl.hpp +++ b/cpp/src/io/json/reader_impl.hpp @@ -24,16 +24,19 @@ #include "json.h" #include "json_gpu.h" -#include -#include -#include - #include +#include + #include #include #include +#include +#include + +#include + namespace cudf { namespace io { namespace detail { @@ -117,14 +120,14 @@ class reader::impl { * @return Array of keys and a map that maps their hash values to column indices */ std::pair, col_map_ptr_type> get_json_object_keys_hashes( - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Decompress the input data, if needed * * Sets the uncomp_data_ and uncomp_size_ data members */ - void decompress_input(cudaStream_t stream); + void decompress_input(rmm::cuda_stream_view stream); /** * @brief Finds all record starts in the file and stores them in rec_starts_ @@ -133,7 +136,7 @@ class reader::impl { * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ - void set_record_starts(cudaStream_t stream); + void set_record_starts(rmm::cuda_stream_view stream); /** * @brief Uploads the relevant segment of the input json data onto the GPU. 
@@ -142,7 +145,7 @@ class reader::impl { * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. */ - void upload_data_to_device(cudaStream_t stream); + void upload_data_to_device(rmm::cuda_stream_view stream); /** * @brief Parse the first row to set the column name @@ -151,7 +154,7 @@ class reader::impl { * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ - void set_column_names(cudaStream_t stream); + void set_column_names(rmm::cuda_stream_view stream); /** * @brief Set the data type array data member @@ -160,7 +163,7 @@ class reader::impl { * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ - void set_data_types(cudaStream_t stream); + void set_data_types(rmm::cuda_stream_view stream); /** * @brief Parse the input data and store results a table @@ -169,7 +172,7 @@ class reader::impl { * * @return Table and its metadata */ - table_with_metadata convert_data_to_table(cudaStream_t stream); + table_with_metadata convert_data_to_table(rmm::cuda_stream_view stream); public: /** @@ -188,7 +191,7 @@ class reader::impl { * * @return Table and its metadata */ - table_with_metadata read(json_reader_options const &options, cudaStream_t stream); + table_with_metadata read(json_reader_options const &options, rmm::cuda_stream_view stream); }; } // namespace json diff --git a/cpp/src/io/orc/chunked_state.hpp b/cpp/src/io/orc/chunked_state.hpp index c72a8485384..71bdb473f41 100644 --- a/cpp/src/io/orc/chunked_state.hpp +++ b/cpp/src/io/orc/chunked_state.hpp @@ -29,6 +29,8 @@ #include #include +#include + #include #include #include @@ -43,7 +45,7 @@ struct orc_chunked_state { /// The writer to be used std::unique_ptr wp; /// Cuda stream to be used - cudaStream_t stream; + rmm::cuda_stream_view stream; /// Overall file metadata. 
Filled in during the process and written during write_chunked_end() cudf::io::orc::FileFooter ff; cudf::io::orc::Metadata md; diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 7f10097db67..4df1e43dcce 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -13,11 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include + #include "orc_common.h" #include "orc_gpu.h" +#include + #include +#include #include #include @@ -436,11 +439,11 @@ __global__ void __launch_bounds__(block_size) void InitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns, uint32_t num_rowgroups, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(512, 1); // 512 threads per chunk dim3 dim_grid(num_columns, num_rowgroups); - gpuInitDictionaryIndices<512><<>>(chunks, num_columns); + gpuInitDictionaryIndices<512><<>>(chunks, num_columns); } /** @@ -453,8 +456,6 @@ void InitDictionaryIndices(DictionaryChunk *chunks, * @param[in] num_rowgroups Number of row groups * @param[in] num_columns Number of columns * @param[in] stream CUDA stream to use, default 0 - * - * @return cudaSuccess if successful, a CUDA error code otherwise */ void BuildStripeDictionaries(StripeDictionary *stripes, StripeDictionary *stripes_host, @@ -462,11 +463,11 @@ void BuildStripeDictionaries(StripeDictionary *stripes, uint32_t num_stripes, uint32_t num_rowgroups, uint32_t num_columns, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(1024, 1); // 1024 threads per chunk dim3 dim_grid_build(num_columns, num_stripes); - gpuCompactChunkDictionaries<<>>( + gpuCompactChunkDictionaries<<>>( stripes, chunks, num_columns); for (uint32_t i = 0; i < num_stripes * num_columns; i++) { if (stripes_host[i].dict_data != nullptr) { @@ -474,7 +475,7 @@ void BuildStripeDictionaries(StripeDictionary *stripes, const nvstrdesc_s *str_data = 
static_cast(stripes_host[i].column_data_base); // NOTE: Requires the --expt-extended-lambda nvcc flag - thrust::sort(rmm::exec_policy(stream)->on(stream), + thrust::sort(rmm::exec_policy(stream)->on(stream.value()), p, p + stripes_host[i].num_strings, [str_data] __device__(const uint32_t &lhs, const uint32_t &rhs) { @@ -485,7 +486,8 @@ void BuildStripeDictionaries(StripeDictionary *stripes, }); } } - gpuBuildStripeDictionaries<1024><<>>(stripes, num_columns); + gpuBuildStripeDictionaries<1024> + <<>>(stripes, num_columns); } } // namespace gpu diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index 65de710f068..de35d4a66b9 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -16,10 +16,13 @@ #pragma once +#include "timezone.cuh" + #include +#include #include -#include "timezone.cuh" +#include namespace cudf { namespace io { @@ -192,8 +195,8 @@ struct StripeDictionary { void ParseCompressedStripeData(CompressedStreamInfo *strm_info, int32_t num_streams, uint32_t compression_block_size, - uint32_t log2maxcr = 24, - cudaStream_t stream = (cudaStream_t)0); + uint32_t log2maxcr = 24, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for re-assembling decompressed blocks into a single contiguous block @@ -204,7 +207,7 @@ void ParseCompressedStripeData(CompressedStreamInfo *strm_info, */ void PostDecompressionReassemble(CompressedStreamInfo *strm_info, int32_t num_streams, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for constructing rowgroup from index streams @@ -224,7 +227,7 @@ void ParseRowGroupIndex(RowGroup *row_groups, uint32_t num_stripes, uint32_t num_rowgroups, uint32_t rowidx_stride, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for decoding NULLs and building string dictionary index tables @@ -241,9 +244,9 @@ void 
DecodeNullsAndStringDictionaries(ColumnDesc *chunks, DictionaryEntry *global_dictionary, uint32_t num_columns, uint32_t num_stripes, - size_t max_rows = ~0, - size_t first_row = 0, - cudaStream_t stream = (cudaStream_t)0); + size_t max_rows = ~0, + size_t first_row = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for decoding column data @@ -271,7 +274,7 @@ void DecodeOrcColumnData(ColumnDesc *chunks, const RowGroup *row_groups = 0, uint32_t num_rowgroups = 0, uint32_t rowidx_stride = 0, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for encoding column data @@ -284,7 +287,7 @@ void DecodeOrcColumnData(ColumnDesc *chunks, void EncodeOrcColumnData(EncChunk *chunks, uint32_t num_columns, uint32_t num_rowgroups, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for encoding column dictionaries @@ -301,7 +304,7 @@ void EncodeStripeDictionaries(StripeDictionary *stripes, uint32_t num_string_columns, uint32_t num_columns, uint32_t num_stripes, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for compacting chunked column data prior to compression @@ -316,7 +319,7 @@ void CompactOrcDataStreams(StripeStream *strm_desc, EncChunk *chunks, uint32_t num_stripe_streams, uint32_t num_columns, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel(s) for compressing data streams @@ -340,7 +343,7 @@ void CompressOrcDataStreams(uint8_t *compressed_data, uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for initializing dictionary chunks @@ -353,7 +356,7 @@ 
void CompressOrcDataStreams(uint8_t *compressed_data, void InitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns, uint32_t num_rowgroups, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for building stripe dictionaries @@ -372,7 +375,7 @@ void BuildStripeDictionaries(StripeDictionary *stripes_dev, uint32_t num_stripes, uint32_t num_rowgroups, uint32_t num_columns, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernels to initialize statistics collection @@ -389,7 +392,7 @@ void orc_init_statistics_groups(statistics_group *groups, uint32_t num_columns, uint32_t num_rowgroups, uint32_t row_index_stride, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernels to return statistics buffer offsets and sizes @@ -402,7 +405,7 @@ void orc_init_statistics_groups(statistics_group *groups, void orc_init_statistics_buffersize(statistics_merge_group *groups, const statistics_chunk *chunks, uint32_t statistics_count, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel to encode statistics in ORC protobuf format @@ -416,7 +419,7 @@ void orc_encode_statistics(uint8_t *blob_bfr, statistics_merge_group *groups, const statistics_chunk *chunks, uint32_t statistics_count, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); } // namespace gpu } // namespace orc diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index c57725b21fb..a9f03aef095 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -391,7 +392,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( size_t num_stripes, 
rmm::device_vector &row_groups, size_t row_index_stride, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Parse the columns' compressed info hostdevice_vector compinfo(0, stream_info.size(), stream); @@ -404,7 +405,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( compinfo.host_ptr(), compinfo.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::ParseCompressedStripeData(compinfo.device_ptr(), compinfo.size(), decompressor->GetBlockSize(), @@ -414,8 +415,8 @@ rmm::device_buffer reader::impl::decompress_stripe_data( compinfo.device_ptr(), compinfo.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); // Count the exact number of compressed blocks size_t num_compressed_blocks = 0; @@ -453,7 +454,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( compinfo.host_ptr(), compinfo.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::ParseCompressedStripeData(compinfo.device_ptr(), compinfo.size(), decompressor->GetBlockSize(), @@ -489,8 +490,8 @@ rmm::device_buffer reader::impl::decompress_stripe_data( compinfo.device_ptr(), compinfo.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); const size_t num_columns = chunks.size() / num_stripes; @@ -511,7 +512,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( chunks.host_ptr(), chunks.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::ParseRowGroupIndex(row_groups.data().get(), compinfo.device_ptr(), chunks.device_ptr(), @@ -533,7 +534,7 @@ void reader::impl::decode_stream_data(hostdevice_vector &chunks const rmm::device_vector &row_groups, size_t row_index_stride, std::vector &out_buffers, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const auto num_columns = out_buffers.size(); const auto num_stripes = chunks.size() / 
out_buffers.size(); @@ -550,8 +551,11 @@ void reader::impl::decode_stream_data(hostdevice_vector &chunks // Allocate global dictionary for deserializing rmm::device_vector global_dict(num_dicts); - CUDA_TRY(cudaMemcpyAsync( - chunks.device_ptr(), chunks.host_ptr(), chunks.memory_size(), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(chunks.device_ptr(), + chunks.host_ptr(), + chunks.memory_size(), + cudaMemcpyHostToDevice, + stream.value())); gpu::DecodeNullsAndStringDictionaries(chunks.device_ptr(), global_dict.data().get(), num_columns, @@ -570,9 +574,12 @@ void reader::impl::decode_stream_data(hostdevice_vector &chunks row_groups.size() / num_columns, row_index_stride, stream); - CUDA_TRY(cudaMemcpyAsync( - chunks.host_ptr(), chunks.device_ptr(), chunks.memory_size(), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(chunks.host_ptr(), + chunks.device_ptr(), + chunks.memory_size(), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); for (size_t i = 0; i < num_stripes; ++i) { for (size_t j = 0; j < num_columns; ++j) { @@ -584,7 +591,7 @@ void reader::impl::decode_stream_data(hostdevice_vector &chunks reader::impl::impl(std::unique_ptr source, orc_reader_options const &options, rmm::mr::device_memory_resource *mr) - : _source(std::move(source)), _mr(mr) + : _mr(mr), _source(std::move(source)) { // Open and parse the source dataset metadata _metadata = std::make_unique(_source.get()); @@ -611,7 +618,7 @@ reader::impl::impl(std::unique_ptr source, table_with_metadata reader::impl::read(size_type skip_rows, size_type num_rows, const std::vector &stripes, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::vector> out_columns; table_metadata out_metadata; @@ -698,8 +705,9 @@ table_with_metadata reader::impl::read(size_type skip_rows, stream_count++; } const auto buffer = _source->host_read(offset, len); - CUDA_TRY(cudaMemcpyAsync(d_dst, buffer->data(), len, 
cudaMemcpyHostToDevice, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY( + cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyHostToDevice, stream.value())); + stream.synchronize(); } // Update chunks to reference streams pointers @@ -758,7 +766,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, chunks.host_ptr(), chunks.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::ParseRowGroupIndex(row_groups.data().get(), nullptr, chunks.device_ptr(), @@ -799,7 +807,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, stream); for (size_t i = 0; i < column_types.size(); ++i) { - out_columns.emplace_back(make_column(out_buffers[i], stream, _mr)); + out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, _mr)); } } } @@ -839,7 +847,7 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(orc_reader_options const &options, cudaStream_t stream) +table_with_metadata reader::read(orc_reader_options const &options, rmm::cuda_stream_view stream) { return _impl->read( options.get_skip_rows(), options.get_num_rows(), options.get_stripes(), stream); diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 4fbea95664a..4684dbdcf96 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -26,6 +26,8 @@ #include #include +#include + #include #include #include @@ -41,7 +43,7 @@ using namespace cudf::io; // Forward declarations class metadata; namespace { -class orc_stream_info; +struct orc_stream_info; } /** @@ -73,7 +75,7 @@ class reader::impl { table_with_metadata read(size_type skip_rows, size_type num_rows, const std::vector &stripes, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: /** @@ -97,7 +99,7 @@ class reader::impl { size_t num_stripes, rmm::device_vector &row_groups, size_t row_index_stride, - cudaStream_t stream); + rmm::cuda_stream_view 
stream); /** * @brief Converts the stripe column data and outputs to columns @@ -120,7 +122,7 @@ class reader::impl { const rmm::device_vector &row_groups, size_t row_index_stride, std::vector &out_buffers, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: rmm::mr::device_memory_resource *_mr = nullptr; diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index e7aaba2a7cf..a987c171392 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -13,10 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include + #include "orc_common.h" #include "orc_gpu.h" +#include + +#include + namespace cudf { namespace io { namespace orc { @@ -384,11 +388,11 @@ void orc_init_statistics_groups(statistics_group *groups, uint32_t num_columns, uint32_t num_rowgroups, uint32_t row_index_stride, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_grid((num_rowgroups + init_groups_per_block - 1) / init_groups_per_block, num_columns); dim3 dim_block(init_threads_per_group, init_groups_per_block); - gpu_init_statistics_groups<<>>( + gpu_init_statistics_groups<<>>( groups, cols, num_columns, num_rowgroups, row_index_stride); } @@ -403,10 +407,11 @@ void orc_init_statistics_groups(statistics_group *groups, void orc_init_statistics_buffersize(statistics_merge_group *groups, const statistics_chunk *chunks, uint32_t statistics_count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(buffersize_reduction_dim, buffersize_reduction_dim); - gpu_init_statistics_buffersize<<<1, dim_block, 0, stream>>>(groups, chunks, statistics_count); + gpu_init_statistics_buffersize<<<1, dim_block, 0, stream.value()>>>( + groups, chunks, statistics_count); } /** @@ -421,12 +426,12 @@ void orc_encode_statistics(uint8_t *blob_bfr, statistics_merge_group *groups, const statistics_chunk *chunks, uint32_t statistics_count, - cudaStream_t stream) + 
rmm::cuda_stream_view stream) { unsigned int num_blocks = (statistics_count + encode_chunks_per_block - 1) / encode_chunks_per_block; dim3 dim_block(encode_threads_per_chunk, encode_chunks_per_block); - gpu_encode_statistics<<>>( + gpu_encode_statistics<<>>( blob_bfr, groups, chunks, statistics_count); } diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 47192172255..cc456978e7a 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -14,11 +14,15 @@ * limitations under the License. */ -#include -#include #include "orc_common.h" #include "orc_gpu.h" +#include + +#include + +#include + #define LOG2_BYTESTREAM_BFRSZ 13 // Must be able to handle 512x 8-byte values #define BYTESTREAM_BFRSZ (1 << LOG2_BYTESTREAM_BFRSZ) @@ -1778,11 +1782,11 @@ void __host__ DecodeNullsAndStringDictionaries(ColumnDesc *chunks, uint32_t num_stripes, size_t max_num_rows, size_t first_row, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(NTHREADS, 1); dim3 dim_grid(num_columns, num_stripes * 2); // 1024 threads per chunk - gpuDecodeNullsAndStringDictionaries<<>>( + gpuDecodeNullsAndStringDictionaries<<>>( chunks, global_dictionary, num_columns, num_stripes, max_num_rows, first_row); } @@ -1811,21 +1815,21 @@ void __host__ DecodeOrcColumnData(ColumnDesc *chunks, const RowGroup *row_groups, uint32_t num_rowgroups, uint32_t rowidx_stride, - cudaStream_t stream) + rmm::cuda_stream_view stream) { uint32_t num_chunks = num_columns * num_stripes; dim3 dim_block(NTHREADS, 1); // 1024 threads per chunk dim3 dim_grid((num_rowgroups > 0) ? num_columns : num_chunks, (num_rowgroups > 0) ? 
num_rowgroups : 1); - gpuDecodeOrcColumnData<<>>(chunks, - global_dictionary, - tz_table, - row_groups, - max_num_rows, - first_row, - num_columns, - num_rowgroups, - rowidx_stride); + gpuDecodeOrcColumnData<<>>(chunks, + global_dictionary, + tz_table, + row_groups, + max_num_rows, + first_row, + num_columns, + num_rowgroups, + rowidx_stride); } } // namespace gpu diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index ea88e3ea645..51f7fdaeed3 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -13,11 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include + #include "orc_common.h" #include "orc_gpu.h" +#include + +#include + +#include + // Apache ORC reader does not handle zero-length patch lists for RLEv2 mode2 // Workaround replaces zero-length patch lists by a dummy zero patch #define ZERO_PLL_WAR 1 @@ -1247,12 +1252,12 @@ __global__ void __launch_bounds__(1024) gpuCompactCompressedBlocks(StripeStream void EncodeOrcColumnData(EncChunk *chunks, uint32_t num_columns, uint32_t num_rowgroups, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(512, 1); // 512 threads per chunk dim3 dim_grid(num_columns, num_rowgroups); gpuEncodeOrcColumnData<512> - <<>>(chunks, num_columns, num_rowgroups); + <<>>(chunks, num_columns, num_rowgroups); } /** @@ -1270,12 +1275,12 @@ void EncodeStripeDictionaries(StripeDictionary *stripes, uint32_t num_string_columns, uint32_t num_columns, uint32_t num_stripes, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(512, 1); // 512 threads per dictionary dim3 dim_grid(num_string_columns * num_stripes, 2); gpuEncodeStringDictionaries<512> - <<>>(stripes, chunks, num_columns); + <<>>(stripes, chunks, num_columns); } /** @@ -1291,11 +1296,12 @@ void CompactOrcDataStreams(StripeStream *strm_desc, EncChunk *chunks, uint32_t num_stripe_streams, uint32_t num_columns, - 
cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(1024, 1); dim3 dim_grid(num_stripe_streams, 1); - gpuCompactOrcDataStreams<<>>(strm_desc, chunks, num_columns); + gpuCompactOrcDataStreams<<>>( + strm_desc, chunks, num_columns); } /** @@ -1321,15 +1327,15 @@ void CompressOrcDataStreams(uint8_t *compressed_data, uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block_init(256, 1); dim3 dim_grid(num_stripe_streams, 1); - gpuInitCompressionBlocks<<>>( + gpuInitCompressionBlocks<<>>( strm_desc, chunks, comp_in, comp_out, compressed_data, comp_blk_size); if (compression == SNAPPY) { gpu_snap(comp_in, comp_out, num_compressed_blocks, stream); } dim3 dim_block_compact(1024, 1); - gpuCompactCompressedBlocks<<>>( + gpuCompactCompressedBlocks<<>>( strm_desc, comp_in, comp_out, compressed_data, comp_blk_size); } diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 77a1a122e4f..9ccd7a9cfc8 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -13,10 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include + #include "orc_common.h" #include "orc_gpu.h" +#include + +#include + namespace cudf { namespace io { namespace orc { @@ -459,21 +463,22 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo *strm_info, int32_t num_streams, uint32_t compression_block_size, uint32_t log2maxcr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block - gpuParseCompressedStripeData<<>>( + gpuParseCompressedStripeData<<>>( strm_info, num_streams, compression_block_size, log2maxcr); } void __host__ PostDecompressionReassemble(CompressedStreamInfo *strm_info, int32_t num_streams, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block - gpuPostDecompressionReassemble<<>>(strm_info, num_streams); + gpuPostDecompressionReassemble<<>>(strm_info, + num_streams); } /** @@ -494,11 +499,11 @@ void __host__ ParseRowGroupIndex(RowGroup *row_groups, uint32_t num_stripes, uint32_t num_rowgroups, uint32_t rowidx_stride, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid(num_columns, num_stripes); // 1 column chunk per block - gpuParseRowGroupIndex<<>>( + gpuParseRowGroupIndex<<>>( row_groups, strm_info, chunks, num_columns, num_stripes, num_rowgroups, rowidx_stride); } diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 9d73a05766a..ba3696fbefb 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -24,13 +24,14 @@ #include #include +#include +#include +#include + #include #include #include -#include -#include - namespace cudf { namespace io { namespace detail { @@ -141,7 +142,7 @@ class orc_column_view { size_t str_id, column_view const &col, const table_metadata *metadata, - cudaStream_t stream) + rmm::cuda_stream_view stream) : _id(id), _str_id(str_id), 
_string_type(col.type().id() == type_id::STRING), @@ -156,14 +157,16 @@ class orc_column_view { if (_string_type && _data_count > 0) { strings_column_view view{col}; _indexes = rmm::device_buffer(_data_count * sizeof(gpu::nvstrdesc_s), stream); - stringdata_to_nvstrdesc<<<((_data_count - 1) >> 8) + 1, 256, 0, stream>>>( + + stringdata_to_nvstrdesc<<<((_data_count - 1) >> 8) + 1, 256, 0, stream.value()>>>( static_cast(_indexes.data()), view.offsets().data() + view.offset(), view.chars().data(), _nulls, _data_count); _data = _indexes.data(); - CUDA_TRY(cudaStreamSynchronize(stream)); + + stream.synchronize(); } // Generating default name if name isn't present in metadata if (metadata && _id < metadata->column_names.size()) { @@ -254,7 +257,7 @@ void writer::impl::init_dictionaries(orc_column_view *columns, uint32_t *dict_data, uint32_t *dict_index, hostdevice_vector &dict, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const size_t num_rowgroups = dict.size() / str_col_ids.size(); @@ -280,12 +283,18 @@ void writer::impl::init_dictionaries(orc_column_view *columns, } } - CUDA_TRY(cudaMemcpyAsync( - dict.device_ptr(), dict.host_ptr(), dict.memory_size(), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(dict.device_ptr(), + dict.host_ptr(), + dict.memory_size(), + cudaMemcpyHostToDevice, + stream.value())); gpu::InitDictionaryIndices(dict.device_ptr(), str_col_ids.size(), num_rowgroups, stream); - CUDA_TRY(cudaMemcpyAsync( - dict.host_ptr(), dict.device_ptr(), dict.memory_size(), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(dict.host_ptr(), + dict.device_ptr(), + dict.memory_size(), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); } void writer::impl::build_dictionaries(orc_column_view *columns, @@ -295,7 +304,7 @@ void writer::impl::build_dictionaries(orc_column_view *columns, hostdevice_vector const &dict, uint32_t *dict_index, hostdevice_vector &stripe_dict, - 
cudaStream_t stream) + rmm::cuda_stream_view stream) { const auto num_rowgroups = dict.size() / str_col_ids.size(); @@ -337,7 +346,7 @@ void writer::impl::build_dictionaries(orc_column_view *columns, stripe_dict.host_ptr(), stripe_dict.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::BuildStripeDictionaries(stripe_dict.device_ptr(), stripe_dict.host_ptr(), dict.device_ptr(), @@ -349,8 +358,8 @@ void writer::impl::build_dictionaries(orc_column_view *columns, stripe_dict.device_ptr(), stripe_dict.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); } std::vector writer::impl::gather_streams(orc_column_view *columns, @@ -522,7 +531,7 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns, std::vector const &streams, std::vector const &strm_ids, hostdevice_vector &chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Allocate combined buffer for RLE data and string data output std::vector strm_offsets(streams.size()); @@ -627,8 +636,11 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns, } } - CUDA_TRY(cudaMemcpyAsync( - chunks.device_ptr(), chunks.host_ptr(), chunks.memory_size(), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(chunks.device_ptr(), + chunks.host_ptr(), + chunks.memory_size(), + cudaMemcpyHostToDevice, + stream.value())); if (!str_col_ids.empty()) { auto d_stripe_dict = columns[str_col_ids[0]].device_stripe_dict(); gpu::EncodeStripeDictionaries(d_stripe_dict, @@ -639,7 +651,7 @@ rmm::device_buffer writer::impl::encode_columns(orc_column_view *columns, stream); } gpu::EncodeOrcColumnData(chunks.device_ptr(), num_columns, num_rowgroups, stream); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); return output; } @@ -652,7 +664,7 @@ std::vector writer::impl::gather_stripes( std::vector const &stripe_list, hostdevice_vector &chunks, hostdevice_vector 
&strm_desc, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::vector stripes(stripe_list.size()); size_t group = 0; @@ -687,17 +699,20 @@ std::vector writer::impl::gather_stripes( strm_desc.host_ptr(), strm_desc.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::CompactOrcDataStreams( strm_desc.device_ptr(), chunks.device_ptr(), strm_desc.size(), num_columns, stream); CUDA_TRY(cudaMemcpyAsync(strm_desc.host_ptr(), strm_desc.device_ptr(), strm_desc.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaMemcpyAsync( - chunks.host_ptr(), chunks.device_ptr(), chunks.memory_size(), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + CUDA_TRY(cudaMemcpyAsync(chunks.host_ptr(), + chunks.device_ptr(), + chunks.memory_size(), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); return stripes; } @@ -710,7 +725,7 @@ std::vector> writer::impl::gather_statistic_blobs( std::vector const &stripe_list, std::vector const &stripes, hostdevice_vector &chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { size_t num_stat_blobs = (1 + stripe_list.size()) * num_columns; size_t num_chunks = chunks.size(); @@ -767,12 +782,12 @@ std::vector> writer::impl::gather_statistic_blobs( stat_desc.host_ptr(), stat_desc.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync(stat_merge.device_ptr(), stat_merge.host_ptr(), stat_merge.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::orc_init_statistics_groups(stat_groups.data().get(), stat_desc.device_ptr(), num_columns, @@ -798,8 +813,8 @@ std::vector> writer::impl::gather_statistic_blobs( stat_merge.device_ptr(), stat_merge.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); hostdevice_vector blobs(stat_merge[num_stat_blobs - 1].start_chunk + stat_merge[num_stat_blobs - 
1].num_chunks); @@ -812,10 +827,13 @@ std::vector> writer::impl::gather_statistic_blobs( stat_merge.device_ptr(), stat_merge.memory_size(), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaMemcpyAsync( - blobs.host_ptr(), blobs.device_ptr(), blobs.memory_size(), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + CUDA_TRY(cudaMemcpyAsync(blobs.host_ptr(), + blobs.device_ptr(), + blobs.memory_size(), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); for (size_t i = 0; i < num_stat_blobs; i++) { const uint8_t *stat_begin = blobs.host_ptr(stat_merge[i].start_chunk); @@ -919,15 +937,16 @@ void writer::impl::write_data_stream(gpu::StripeStream const &strm_desc, uint8_t *stream_out, StripeInformation &stripe, std::vector &streams, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const auto length = strm_desc.stream_size; streams[chunk.strm_id[strm_desc.stream_type]].length = length; if (length != 0) { const auto *stream_in = (compression_kind_ == NONE) ? 
chunk.streams[strm_desc.stream_type] : (compressed_data + strm_desc.bfr_offset); - CUDA_TRY(cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY( + cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); out_sink_->host_write(stream_out, length); } @@ -966,7 +985,7 @@ writer::impl::impl(std::unique_ptr sink, void writer::impl::write(table_view const &table, const table_metadata *metadata, - cudaStream_t stream) + rmm::cuda_stream_view stream) { orc_chunked_state state; state.user_metadata = metadata; @@ -1156,7 +1175,7 @@ void writer::impl::write_chunk(table_view const &table, orc_chunked_state &state strm_desc.host_ptr(), strm_desc.memory_size(), cudaMemcpyHostToDevice, - state.stream)); + state.stream.value())); gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), strm_desc.device_ptr(), chunks.device_ptr(), @@ -1171,13 +1190,13 @@ void writer::impl::write_chunk(table_view const &table, orc_chunked_state &state strm_desc.device_ptr(), strm_desc.memory_size(), cudaMemcpyDeviceToHost, - state.stream)); + state.stream.value())); CUDA_TRY(cudaMemcpyAsync(comp_out.host_ptr(), comp_out.device_ptr(), comp_out.memory_size(), cudaMemcpyDeviceToHost, - state.stream)); - CUDA_TRY(cudaStreamSynchronize(state.stream)); + state.stream.value())); + state.stream.synchronize(); } ProtobufWriter pbw_(&buffer_); @@ -1362,7 +1381,9 @@ writer::writer(std::unique_ptr sink, writer::~writer() = default; // Forward to implementation -void writer::write(table_view const &table, const table_metadata *metadata, cudaStream_t stream) +void writer::write(table_view const &table, + const table_metadata *metadata, + rmm::cuda_stream_view stream) { _impl->write(table, metadata, stream); } diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 6a96a8d4d7d..a7b1fef87ba 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ 
b/cpp/src/io/orc/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once +#include "chunked_state.hpp" #include "orc.h" #include "orc_gpu.h" @@ -28,12 +29,12 @@ #include #include +#include + #include #include #include -#include "chunked_state.hpp" - namespace cudf { namespace io { namespace detail { @@ -76,7 +77,7 @@ class writer::impl { * @param metadata The metadata associated with the table * @param stream CUDA stream used for device memory operations and kernel launches. **/ - void write(table_view const& table, const table_metadata* metadata, cudaStream_t stream); + void write(table_view const& table, const table_metadata* metadata, rmm::cuda_stream_view stream); /** * @brief Begins the chunked/streamed write process. @@ -121,7 +122,7 @@ class writer::impl { uint32_t* dict_data, uint32_t* dict_index, hostdevice_vector& dict, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Builds up per-stripe dictionaries for string columns @@ -142,7 +143,7 @@ class writer::impl { hostdevice_vector const& dict, uint32_t* dict_index, hostdevice_vector& stripe_dict, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Returns stream information for each column @@ -187,7 +188,7 @@ class writer::impl { std::vector const& streams, std::vector const& strm_ids, hostdevice_vector& chunks, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Returns stripe information after compacting columns' individual data @@ -211,7 +212,7 @@ class writer::impl { std::vector const& stripe_list, hostdevice_vector& chunks, hostdevice_vector& strm_desc, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Returns per-stripe and per-file column statistics encoded @@ -236,7 +237,7 @@ class 
writer::impl { std::vector const& stripe_list, std::vector const& stripes, hostdevice_vector& chunks, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Write the specified column's row index stream @@ -285,7 +286,7 @@ class writer::impl { uint8_t* stream_out, StripeInformation& stripe, std::vector& streams, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Insert 3-byte uncompressed block headers in a byte vector diff --git a/cpp/src/io/parquet/chunked_state.hpp b/cpp/src/io/parquet/chunked_state.hpp index a6ea7f23385..5bbc5366f70 100644 --- a/cpp/src/io/parquet/chunked_state.hpp +++ b/cpp/src/io/parquet/chunked_state.hpp @@ -21,9 +21,12 @@ #pragma once -#include #include +#include + +#include + namespace cudf { namespace io { @@ -37,7 +40,7 @@ struct pq_chunked_state { /// The writer to be used std::unique_ptr wp; /// Cuda stream to be used - cudaStream_t stream; + rmm::cuda_stream_view stream; /// Overall file metadata. Filled in during the process and written during write_chunked_end() cudf::io::parquet::FileMetaData md; /// current write position for rowgroups/chunks @@ -56,13 +59,13 @@ struct pq_chunked_state { pq_chunked_state() = default; pq_chunked_state(table_metadata const* metadata, - SingleWriteMode mode = SingleWriteMode::NO, - bool write_int96_timestamps = false, - cudaStream_t str = 0) - : user_metadata{metadata}, + SingleWriteMode mode = SingleWriteMode::NO, + bool write_int96_timestamps = false, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) + : stream{stream}, + user_metadata{metadata}, single_write_mode{mode == SingleWriteMode::YES}, - int96_timestamps(write_int96_timestamps), - stream{str} + int96_timestamps(write_int96_timestamps) { } }; diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 693c82cb4cd..3e4584a9731 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -14,17 +14,20 @@ * limitations under the License. 
*/ +#include +#include +#include + +#include +#include + #include +#include + #include #include #include #include -#include -#include -#include -#include - -#include #define LOG2_NTHREADS (5 + 2) #define NTHREADS (1 << LOG2_NTHREADS) @@ -1711,16 +1714,16 @@ struct chunk_row_output_iter { using reference = size_type &; using iterator_category = thrust::output_device_iterator_tag; - chunk_row_output_iter operator+ __host__ __device__(int i) + __host__ __device__ chunk_row_output_iter operator+(int i) { return chunk_row_output_iter{p + i}; } - void operator++ __host__ __device__() { p++; } + __host__ __device__ void operator++() { p++; } - reference operator[] __device__(int i) { return p[i].chunk_row; } - reference operator*__device__() { return p->chunk_row; } - void operator= __device__(value_type v) { p->chunk_row = v; } + __device__ reference operator[](int i) { return p[i].chunk_row; } + __device__ reference operator*() { return p->chunk_row; } + __device__ void operator=(value_type v) { p->chunk_row = v; } }; struct start_offset_output_iterator { @@ -1736,19 +1739,19 @@ struct start_offset_output_iterator { using reference = size_type &; using iterator_category = thrust::output_device_iterator_tag; - start_offset_output_iterator operator+ __host__ __device__(int i) + __host__ __device__ start_offset_output_iterator operator+(int i) { return start_offset_output_iterator{ pages, page_indices, cur_index + i, src_col_schema, nesting_depth}; } - void operator++ __host__ __device__() { cur_index++; } + __host__ __device__ void operator++() { cur_index++; } - reference operator[] __device__(int i) { return dereference(cur_index + i); } - reference operator*__device__() { return dereference(cur_index); } + __device__ reference operator[](int i) { return dereference(cur_index + i); } + __device__ reference operator*() { return dereference(cur_index); } private: - reference __device__ dereference(int index) + __device__ reference dereference(int index) { PageInfo const 
&p = pages[page_indices[index]]; if (p.src_col_schema != src_col_schema || p.flags & PAGEINFO_FLAGS_DICTIONARY) { return empty; } @@ -1765,7 +1768,7 @@ void PreprocessColumnData(hostdevice_vector &pages, std::vector &output_columns, size_t num_rows, size_t min_row, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { dim3 dim_block(NTHREADS, 1); @@ -1774,9 +1777,9 @@ void PreprocessColumnData(hostdevice_vector &pages, // computes: // PageNestingInfo::size for each level of nesting, for each page. // The output from this does not take row bounds (num_rows, min_row) into account - gpuComputePageSizes<<>>( + gpuComputePageSizes<<>>( pages.device_ptr(), chunks.device_ptr(), min_row, num_rows, chunks.size(), false); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); // computes: // PageInfo::chunk_row for all pages @@ -1784,7 +1787,7 @@ void PreprocessColumnData(hostdevice_vector &pages, pages.device_ptr(), [] __device__(PageInfo const &page) { return page.chunk_idx; }); auto page_input = thrust::make_transform_iterator( pages.device_ptr(), [] __device__(PageInfo const &page) { return page.num_rows; }); - thrust::exclusive_scan_by_key(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan_by_key(rmm::exec_policy(stream)->on(stream.value()), key_input, key_input + pages.size(), page_input, @@ -1793,7 +1796,7 @@ void PreprocessColumnData(hostdevice_vector &pages, // computes: // PageNestingInfo::size for each level of nesting, for each page, taking row bounds into account. // PageInfo::skipped_values, which tells us where to start decoding in the input - gpuComputePageSizes<<>>( + gpuComputePageSizes<<>>( pages.device_ptr(), chunks.device_ptr(), min_row, num_rows, chunks.size(), true); // retrieve pages back (PageInfo::num_rows has been set. 
if we don't bring it @@ -1819,14 +1822,15 @@ void PreprocessColumnData(hostdevice_vector &pages, rmm::device_uvector page_keys(pages.size(), stream); rmm::device_uvector page_index(pages.size(), stream); { - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), pages.device_ptr(), pages.device_ptr() + pages.size(), page_keys.begin(), [] __device__(PageInfo const &page) { return page.src_col_schema; }); - thrust::sequence(rmm::exec_policy(stream)->on(stream), page_index.begin(), page_index.end()); - thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), + thrust::sequence( + rmm::exec_policy(stream)->on(stream.value()), page_index.begin(), page_index.end()); + thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream.value()), page_keys.begin(), page_keys.end(), page_index.begin(), @@ -1860,7 +1864,7 @@ void PreprocessColumnData(hostdevice_vector &pages, // columns. so don't compute any given level more than once. 
if (out_buf.size == 0) { int size = thrust::reduce( - rmm::exec_policy(stream)->on(stream), size_input, size_input + pages.size()); + rmm::exec_policy(stream)->on(stream.value()), size_input, size_input + pages.size()); // if this is a list column add 1 for non-leaf levels for the terminating offset if (out_buf.type.id() == type_id::LIST && l_idx < max_depth) { size++; } @@ -1870,7 +1874,7 @@ void PreprocessColumnData(hostdevice_vector &pages, } // compute per-page start offset - thrust::exclusive_scan_by_key(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan_by_key(rmm::exec_policy(stream)->on(stream.value()), page_keys.begin(), page_keys.end(), size_input, @@ -1890,12 +1894,12 @@ void __host__ DecodePageData(hostdevice_vector &pages, hostdevice_vector const &chunks, size_t num_rows, size_t min_row, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(NTHREADS, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page - gpuDecodePageData<<>>( + gpuDecodePageData<<>>( pages.device_ptr(), chunks.device_ptr(), min_row, num_rows, chunks.size()); } diff --git a/cpp/src/io/parquet/page_dict.cu b/cpp/src/io/parquet/page_dict.cu index b4e87d97857..fba2b3ccfd5 100644 --- a/cpp/src/io/parquet/page_dict.cu +++ b/cpp/src/io/parquet/page_dict.cu @@ -13,11 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include -#include + #include #include +#include + +#include + +#include + namespace cudf { namespace io { namespace parquet { @@ -331,11 +336,11 @@ void BuildChunkDictionaries(EncColumnChunk *chunks, uint32_t *dev_scratch, size_t scratch_size, uint32_t num_chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (num_chunks > 0 && scratch_size > 0) { // zero scratch size implies no dictionaries - CUDA_TRY(cudaMemsetAsync(dev_scratch, 0, scratch_size, stream)); - gpuBuildChunkDictionaries<1024><<>>(chunks, dev_scratch); + CUDA_TRY(cudaMemsetAsync(dev_scratch, 0, scratch_size, stream.value())); + gpuBuildChunkDictionaries<1024><<>>(chunks, dev_scratch); } } diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 3d87901f269..3542c25bfb2 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -13,13 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include + #include #include #include #include +#include + +#include + #include #include @@ -1667,7 +1671,7 @@ __global__ void __launch_bounds__(1024) gpuGatherPages(EncColumnChunk *chunks, c * * Similarly we merge up all the way till level 0 offsets */ -dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) +dremel_data get_dremel_data(column_view h_col, rmm::cuda_stream_view stream) { CUDF_EXPECTS(h_col.type().id() == type_id::LIST, "Can only get rep/def levels for LIST type column"); @@ -1679,12 +1683,12 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) auto d_off = lcv.offsets().data(); auto empties_idx_end = - thrust::copy_if(rmm::exec_policy(stream)->on(stream), + thrust::copy_if(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(start), thrust::make_counting_iterator(end), empties_idx.begin(), [d_off] __device__(auto i) { return d_off[i] == d_off[i + 1]; }); - auto empties_end = 
thrust::gather(rmm::exec_policy(stream)->on(stream), + auto empties_end = thrust::gather(rmm::exec_policy(stream)->on(stream.value()), empties_idx.begin(), empties_idx_end, lcv.offsets().begin(), @@ -1794,7 +1798,7 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) auto output_zip_it = thrust::make_zip_iterator(thrust::make_tuple(rep_level.begin(), def_level.begin())); - auto ends = thrust::merge_by_key(rmm::exec_policy(stream)->on(stream), + auto ends = thrust::merge_by_key(rmm::exec_policy(stream)->on(stream.value()), empties.begin(), empties.begin() + empties_size, thrust::make_counting_iterator(column_offsets[level + 1]), @@ -1812,14 +1816,14 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) [off = lcv.offsets().data()] __device__( auto i) -> int { return off[i] == off[i + 1]; }); rmm::device_uvector scan_out(offset_size_at_level, stream); - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), scan_it, scan_it + offset_size_at_level, scan_out.begin()); // Add scan output to existing offsets to get new offsets into merged rep level values new_offsets = rmm::device_uvector(offset_size_at_level, stream); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), offset_size_at_level, [off = lcv.offsets().data() + column_offsets[level], @@ -1830,7 +1834,7 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) // Set rep level values at level starts to appropriate rep level auto scatter_it = thrust::make_constant_iterator(level); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), scatter_it, scatter_it + new_offsets.size() - 1, new_offsets.begin(), @@ -1881,7 +1885,7 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) auto output_zip_it = 
thrust::make_zip_iterator(thrust::make_tuple(rep_level.begin(), def_level.begin())); - auto ends = thrust::merge_by_key(rmm::exec_policy(stream)->on(stream), + auto ends = thrust::merge_by_key(rmm::exec_policy(stream)->on(stream.value()), transformed_empties, transformed_empties + empties_size, thrust::make_counting_iterator(0), @@ -1900,14 +1904,14 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) [off = lcv.offsets().data()] __device__( auto i) -> int { return off[i] == off[i + 1]; }); rmm::device_uvector scan_out(offset_size_at_level, stream); - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), scan_it, scan_it + offset_size_at_level, scan_out.begin()); // Add scan output to existing offsets to get new offsets into merged rep level values rmm::device_uvector temp_new_offsets(offset_size_at_level, stream); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), offset_size_at_level, [off = lcv.offsets().data() + column_offsets[level], @@ -1920,7 +1924,7 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) // Set rep level values at level starts to appropriate rep level auto scatter_it = thrust::make_constant_iterator(level); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), scatter_it, scatter_it + new_offsets.size() - 1, new_offsets.begin(), @@ -1931,7 +1935,7 @@ dremel_data get_dremel_data(column_view h_col, cudaStream_t stream) rep_level.resize(level_vals_size, stream); def_level.resize(level_vals_size, stream); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); size_type leaf_col_offset = column_offsets[column_offsets.size() - 1]; size_type leaf_data_size = column_ends[column_ends.size() - 1] - leaf_col_offset; @@ -1958,10 +1962,10 @@ void 
InitPageFragments(PageFragment *frag, int32_t num_columns, uint32_t fragment_size, uint32_t num_rows, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_grid(num_columns, num_fragments); // 1 threadblock per fragment - gpuInitPageFragments<512><<>>( + gpuInitPageFragments<512><<>>( frag, col_desc, num_fragments, num_columns, fragment_size, num_rows); } @@ -1982,10 +1986,10 @@ void InitFragmentStatistics(statistics_group *groups, int32_t num_fragments, int32_t num_columns, uint32_t fragment_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_grid(num_columns, (num_fragments + 3) >> 2); // 1 warp per fragment - gpuInitFragmentStats<<>>( + gpuInitFragmentStats<<>>( groups, fragments, col_desc, num_fragments, num_columns, fragment_size); } @@ -2008,10 +2012,10 @@ void InitEncoderPages(EncColumnChunk *chunks, int32_t num_columns, statistics_merge_group *page_grstats, statistics_merge_group *chunk_grstats, - cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_grid(num_columns, num_rowgroups); // 1 threadblock per rowgroup - gpuInitPages<<>>( + gpuInitPages<<>>( chunks, pages, col_desc, page_grstats, chunk_grstats, num_rowgroups, num_columns); } @@ -2032,11 +2036,12 @@ void EncodePages(EncPage *pages, uint32_t start_page, gpu_inflate_input_s *comp_in, gpu_inflate_status_s *comp_out, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // A page is part of one column. This is launching 1 block per page. 1 block will exclusively // deal with one datatype. 
- gpuEncodePages<<>>(pages, chunks, comp_in, comp_out, start_page); + gpuEncodePages<<>>( + pages, chunks, comp_in, comp_out, start_page); } /** @@ -2054,9 +2059,9 @@ void DecideCompression(EncColumnChunk *chunks, uint32_t num_chunks, uint32_t start_page, const gpu_inflate_status_s *comp_out, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - gpuDecideCompression<<>>(chunks, pages, comp_out, start_page); + gpuDecideCompression<<>>(chunks, pages, comp_out, start_page); } /** @@ -2078,9 +2083,9 @@ void EncodePageHeaders(EncPage *pages, const gpu_inflate_status_s *comp_out, const statistics_chunk *page_stats, const statistics_chunk *chunk_stats, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - gpuEncodePageHeaders<<>>( + gpuEncodePageHeaders<<>>( pages, chunks, comp_out, page_stats, chunk_stats, start_page); } @@ -2095,9 +2100,9 @@ void EncodePageHeaders(EncPage *pages, void GatherPages(EncColumnChunk *chunks, const EncPage *pages, uint32_t num_chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - gpuGatherPages<<>>(chunks, pages); + gpuGatherPages<<>>(chunks, pages); } } // namespace gpu diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index d150eb72bba..ef496e71d96 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace io { namespace parquet { @@ -468,20 +470,22 @@ extern "C" __global__ void __launch_bounds__(128) } } -void __host__ DecodePageHeaders(ColumnChunkDesc *chunks, int32_t num_chunks, cudaStream_t stream) +void __host__ DecodePageHeaders(ColumnChunkDesc *chunks, + int32_t num_chunks, + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_chunks + 3) >> 2, 1); // 1 chunk per warp, 4 warps per block - gpuDecodePageHeaders<<>>(chunks, num_chunks); + gpuDecodePageHeaders<<>>(chunks, num_chunks); } void __host__ BuildStringDictionaryIndex(ColumnChunkDesc *chunks, int32_t num_chunks, 
- cudaStream_t stream) + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_chunks + 3) >> 2, 1); // 1 chunk per warp, 4 warps per block - gpuBuildStringDictionaryIndex<<>>(chunks, num_chunks); + gpuBuildStringDictionaryIndex<<>>(chunks, num_chunks); } } // namespace gpu diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 97420d5d7f1..9f657d58804 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -16,23 +16,24 @@ #pragma once -#include #include #include -#include #include #include #include #include #include - #include -#include +#include #include #include +#include + +#include + namespace cudf { namespace io { namespace parquet { @@ -171,8 +172,8 @@ struct ColumnChunkDesc { max_num_pages(0), page_info(nullptr), str_dict_index(nullptr), - valid_map_base({nullptr}), - column_data_base({nullptr}), + valid_map_base{nullptr}, + column_data_base{nullptr}, codec(codec_), converted_type(converted_type_), decimal_scale(decimal_scale_), @@ -327,9 +328,7 @@ struct EncColumnChunk { * @param[in] num_chunks Number of column chunks * @param[in] stream CUDA stream to use, default 0 */ -void DecodePageHeaders(ColumnChunkDesc *chunks, - int32_t num_chunks, - cudaStream_t stream = (cudaStream_t)0); +void DecodePageHeaders(ColumnChunkDesc *chunks, int32_t num_chunks, rmm::cuda_stream_view stream); /** * @brief Launches kernel for building the dictionary index for the column @@ -341,7 +340,7 @@ void DecodePageHeaders(ColumnChunkDesc *chunks, */ void BuildStringDictionaryIndex(ColumnChunkDesc *chunks, int32_t num_chunks, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); /** * @brief Preprocess column information for nested schemas. 
@@ -368,7 +367,7 @@ void PreprocessColumnData(hostdevice_vector &pages, std::vector &output_columns, size_t num_rows, size_t min_row, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr); /** @@ -387,7 +386,7 @@ void DecodePageData(hostdevice_vector &pages, hostdevice_vector const &chunks, size_t num_rows, size_t min_row, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); /** * @brief Dremel data that describes one nested type column @@ -420,7 +419,7 @@ struct dremel_data { * * @return A struct containing dremel data */ -dremel_data get_dremel_data(column_view h_col, cudaStream_t stream = (cudaStream_t)0); +dremel_data get_dremel_data(column_view h_col, rmm::cuda_stream_view stream); /** * @brief Launches kernel for initializing encoder page fragments @@ -439,7 +438,7 @@ void InitPageFragments(PageFragment *frag, int32_t num_columns, uint32_t fragment_size, uint32_t num_rows, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); /** * @brief Launches kernel for initializing fragment statistics groups @@ -458,7 +457,7 @@ void InitFragmentStatistics(statistics_group *groups, int32_t num_fragments, int32_t num_columns, uint32_t fragment_size, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); /** * @brief Launches kernel for initializing encoder data pages @@ -479,7 +478,7 @@ void InitEncoderPages(EncColumnChunk *chunks, int32_t num_columns, statistics_merge_group *page_grstats = nullptr, statistics_merge_group *chunk_grstats = nullptr, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel for packing column data into parquet pages @@ -498,7 +497,7 @@ void EncodePages(EncPage *pages, uint32_t start_page = 0, gpu_inflate_input_s *comp_in = nullptr, gpu_inflate_status_s *comp_out = nullptr, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = 
rmm::cuda_stream_default); /** * @brief Launches kernel to make the compressed vs uncompressed chunk-level decision @@ -515,7 +514,7 @@ void DecideCompression(EncColumnChunk *chunks, uint32_t num_chunks, uint32_t start_page, const gpu_inflate_status_s *comp_out = nullptr, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel to encode page headers @@ -536,7 +535,7 @@ void EncodePageHeaders(EncPage *pages, const gpu_inflate_status_s *comp_out = nullptr, const statistics_chunk *page_stats = nullptr, const statistics_chunk *chunk_stats = nullptr, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Launches kernel to gather pages to a single contiguous block per chunk @@ -550,7 +549,7 @@ void EncodePageHeaders(EncPage *pages, void GatherPages(EncColumnChunk *chunks, const EncPage *pages, uint32_t num_chunks, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); /** * @brief Launches kernel for building chunk dictionaries @@ -565,7 +564,7 @@ void BuildChunkDictionaries(EncColumnChunk *chunks, uint32_t *dev_scratch, size_t scratch_size, uint32_t num_chunks, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); } // namespace gpu } // namespace parquet diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index e615b4782ea..85c9a3c2919 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -758,7 +759,7 @@ void reader::impl::read_column_chunks( size_t end_chunk, const std::vector &column_chunk_offsets, std::vector const &chunk_source_map, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Transfer chunk data, coalescing adjacent chunks for (size_t chunk = begin_chunk; chunk < end_chunk;) { @@ -797,7 +798,7 @@ void reader::impl::read_column_chunks( * @copydoc 
cudf::io::detail::parquet::count_page_headers */ size_t reader::impl::count_page_headers(hostdevice_vector &chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { size_t total_pages = 0; @@ -817,7 +818,7 @@ size_t reader::impl::count_page_headers(hostdevice_vector */ void reader::impl::decode_page_headers(hostdevice_vector &chunks, hostdevice_vector &pages, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), // please update preprocess_nested_columns to reflect this. @@ -838,7 +839,7 @@ void reader::impl::decode_page_headers(hostdevice_vector & rmm::device_buffer reader::impl::decompress_page_data( hostdevice_vector &chunks, hostdevice_vector &pages, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto for_each_codec_page = [&](parquet::Compression codec, const std::function &f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { @@ -902,12 +903,12 @@ rmm::device_buffer reader::impl::decompress_page_data( inflate_in.host_ptr(start_pos), sizeof(decltype(inflate_in)::value_type) * (argc - start_pos), cudaMemcpyHostToDevice, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync(inflate_out.device_ptr(start_pos), inflate_out.host_ptr(start_pos), sizeof(decltype(inflate_out)::value_type) * (argc - start_pos), cudaMemcpyHostToDevice, - stream)); + stream.value())); switch (codec.first) { case parquet::GZIP: CUDA_TRY(gpuinflate(inflate_in.device_ptr(start_pos), @@ -936,15 +937,18 @@ rmm::device_buffer reader::impl::decompress_page_data( inflate_out.device_ptr(start_pos), sizeof(decltype(inflate_out)::value_type) * (argc - start_pos), cudaMemcpyDeviceToHost, - stream)); + stream.value())); } } - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); // Update the page information in device memory with the updated value of // page_data; it now points to the uncompressed data buffer - CUDA_TRY(cudaMemcpyAsync( - pages.device_ptr(), 
pages.host_ptr(), pages.memory_size(), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(pages.device_ptr(), + pages.host_ptr(), + pages.memory_size(), + cudaMemcpyHostToDevice, + stream.value())); return decomp_pages; } @@ -955,7 +959,7 @@ rmm::device_buffer reader::impl::decompress_page_data( void reader::impl::allocate_nesting_info(hostdevice_vector const &chunks, hostdevice_vector &pages, hostdevice_vector &page_nesting_info, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // compute total # of page_nesting infos needed and allocate space. doing this in one // buffer to keep it to a single gpu allocation @@ -1075,7 +1079,7 @@ void reader::impl::preprocess_columns(hostdevice_vector &c size_t min_row, size_t total_rows, bool has_lists, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // TODO : we should be selectively preprocessing only columns that have // lists in them instead of doing them all if even one contains lists. @@ -1096,7 +1100,7 @@ void reader::impl::preprocess_columns(hostdevice_vector &c // preprocess per-nesting level sizes by page gpu::PreprocessColumnData( pages, chunks, _input_columns, _output_columns, total_rows, min_row, stream, _mr); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); } } @@ -1108,7 +1112,7 @@ void reader::impl::decode_page_data(hostdevice_vector &chu hostdevice_vector &page_nesting, size_t min_row, size_t total_rows, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto is_dict_chunk = [](const gpu::ColumnChunkDesc &chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; @@ -1218,7 +1222,7 @@ void reader::impl::decode_page_data(hostdevice_vector &chu gpu::DecodePageData(pages, chunks, total_rows, min_row, stream); pages.device_to_host(stream); page_nesting.device_to_host(stream); - cudaStreamSynchronize(stream); + stream.synchronize(); // for list columns, add the final offset to every offset buffer. 
// TODO : make this happen in more efficiently. Maybe use thrust::for_each @@ -1248,7 +1252,7 @@ void reader::impl::decode_page_data(hostdevice_vector &chu &offset, sizeof(offset), cudaMemcpyHostToDevice, - stream); + stream.value()); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } } @@ -1274,13 +1278,13 @@ void reader::impl::decode_page_data(hostdevice_vector &chu } } - cudaStreamSynchronize(stream); + stream.synchronize(); } reader::impl::impl(std::vector> &&sources, parquet_reader_options const &options, rmm::mr::device_memory_resource *mr) - : _sources(std::move(sources)), _mr(mr) + : _mr(mr), _sources(std::move(sources)) { // Open and parse the source dataset metadata _metadata = std::make_unique(_sources); @@ -1304,7 +1308,7 @@ reader::impl::impl(std::vector> &&sources, table_with_metadata reader::impl::read(size_type skip_rows, size_type num_rows, std::vector> const &row_group_list, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Select only row groups required const auto selected_row_groups = @@ -1473,7 +1477,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, for (size_t i = 0; i < _output_columns.size(); ++i) { out_metadata.schema_info.push_back(column_name_info{""}); out_columns.emplace_back( - make_column(_output_columns[i], stream, _mr, &out_metadata.schema_info.back())); + make_column(_output_columns[i], &out_metadata.schema_info.back(), stream, _mr)); } } } @@ -1517,7 +1521,8 @@ reader::reader(std::vector> &&sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(parquet_reader_options const &options, cudaStream_t stream) +table_with_metadata reader::read(parquet_reader_options const &options, + rmm::cuda_stream_view stream) { return _impl->read( options.get_skip_rows(), options.get_num_rows(), options.get_row_groups(), stream); diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index c192b65f0b0..f6df8f9e460 100644 --- 
a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -31,6 +31,8 @@ #include #include +#include + #include #include #include @@ -75,7 +77,7 @@ class reader::impl { table_with_metadata read(size_type skip_rows, size_type num_rows, std::vector> const &row_group_indices, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: /** @@ -95,7 +97,7 @@ class reader::impl { size_t end_chunk, const std::vector &column_chunk_offsets, std::vector const &chunk_source_map, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Returns the number of total pages from the given column chunks @@ -105,7 +107,8 @@ class reader::impl { * * @return The total number of pages */ - size_t count_page_headers(hostdevice_vector &chunks, cudaStream_t stream); + size_t count_page_headers(hostdevice_vector &chunks, + rmm::cuda_stream_view stream); /** * @brief Returns the page information from the given column chunks. @@ -116,7 +119,7 @@ class reader::impl { */ void decode_page_headers(hostdevice_vector &chunks, hostdevice_vector &pages, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Decompresses the page data, at page granularity. @@ -129,7 +132,7 @@ class reader::impl { */ rmm::device_buffer decompress_page_data(hostdevice_vector &chunks, hostdevice_vector &pages, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Allocate nesting information storage for all pages and set pointers @@ -149,7 +152,7 @@ class reader::impl { void allocate_nesting_info(hostdevice_vector const &chunks, hostdevice_vector &pages, hostdevice_vector &page_nesting_info, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Preprocess column information for nested schemas. @@ -174,7 +177,7 @@ class reader::impl { size_t min_row, size_t total_rows, bool has_lists, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Converts the page data and outputs to columns. 
@@ -191,7 +194,7 @@ class reader::impl { hostdevice_vector &page_nesting, size_t min_row, size_t total_rows, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: rmm::mr::device_memory_resource *_mr = nullptr; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f86e6a5ee67..f6f9ecb431a 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -19,24 +19,26 @@ * @brief cuDF-IO parquet writer class implementation */ -#include #include "writer_impl.hpp" +#include + #include #include #include #include -#include -#include -#include -#include - #include +#include #include #include #include +#include +#include +#include +#include + namespace cudf { namespace io { namespace detail { @@ -123,7 +125,7 @@ class parquet_column_view { column_view const &col, const table_metadata *metadata, bool int96_timestamps, - cudaStream_t stream) + rmm::cuda_stream_view stream) : _col(col), _leaf_col(get_leaf_col(col)), _id(id), @@ -286,19 +288,21 @@ class parquet_column_view { } _offsets_array = offsets_array; - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); } if (_string_type && _data_count > 0) { strings_column_view view{_leaf_col}; _indexes = rmm::device_buffer(_data_count * sizeof(gpu::nvstrdesc_s), stream); - stringdata_to_nvstrdesc<<<((_data_count - 1) >> 8) + 1, 256, 0, stream>>>( + + stringdata_to_nvstrdesc<<<((_data_count - 1) >> 8) + 1, 256, 0, stream.value()>>>( reinterpret_cast(_indexes.data()), view.offsets().data() + leaf_col_offset, view.chars().data(), _nulls, _data_count); _data = _indexes.data(); - CUDA_TRY(cudaStreamSynchronize(stream)); + + stream.synchronize(); } // Generating default name if name isn't present in metadata @@ -427,13 +431,13 @@ void writer::impl::init_page_fragments(hostdevice_vector &fra uint32_t num_fragments, uint32_t num_rows, uint32_t fragment_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { 
CUDA_TRY(cudaMemcpyAsync(col_desc.device_ptr(), col_desc.host_ptr(), col_desc.memory_size(), cudaMemcpyHostToDevice, - stream)); + stream.value())); gpu::InitPageFragments(frag.device_ptr(), col_desc.device_ptr(), num_fragments, @@ -441,9 +445,12 @@ void writer::impl::init_page_fragments(hostdevice_vector &fra fragment_size, num_rows, stream); - CUDA_TRY(cudaMemcpyAsync( - frag.host_ptr(), frag.device_ptr(), frag.memory_size(), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(frag.host_ptr(), + frag.device_ptr(), + frag.memory_size(), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); } void writer::impl::gather_fragment_statistics(statistics_chunk *frag_stats_chunk, @@ -452,7 +459,7 @@ void writer::impl::gather_fragment_statistics(statistics_chunk *frag_stats_chunk uint32_t num_columns, uint32_t num_fragments, uint32_t fragment_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { rmm::device_vector frag_stats_group(num_fragments * num_columns); @@ -465,7 +472,7 @@ void writer::impl::gather_fragment_statistics(statistics_chunk *frag_stats_chunk stream); GatherColumnStatistics( frag_stats_chunk, frag_stats_group.data().get(), num_fragments * num_columns, stream); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); } void writer::impl::build_chunk_dictionaries(hostdevice_vector &chunks, @@ -473,12 +480,15 @@ void writer::impl::build_chunk_dictionaries(hostdevice_vector dict_scratch(dict_scratch_size / sizeof(uint32_t)); - CUDA_TRY(cudaMemcpyAsync( - chunks.device_ptr(), chunks.host_ptr(), chunks.memory_size(), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(chunks.device_ptr(), + chunks.host_ptr(), + chunks.memory_size(), + cudaMemcpyHostToDevice, + stream.value())); gpu::BuildChunkDictionaries(chunks.device_ptr(), dict_scratch.data().get(), dict_scratch_size, @@ -492,9 +502,12 @@ void writer::impl::build_chunk_dictionaries(hostdevice_vector &chunks, @@ 
-506,11 +519,14 @@ void writer::impl::init_encoder_pages(hostdevice_vector &ch uint32_t num_columns, uint32_t num_pages, uint32_t num_stats_bfr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { rmm::device_vector page_stats_mrg(num_stats_bfr); - CUDA_TRY(cudaMemcpyAsync( - chunks.device_ptr(), chunks.host_ptr(), chunks.memory_size(), cudaMemcpyHostToDevice, stream)); + CUDA_TRY(cudaMemcpyAsync(chunks.device_ptr(), + chunks.host_ptr(), + chunks.memory_size(), + cudaMemcpyHostToDevice, + stream.value())); InitEncoderPages(chunks.device_ptr(), pages, col_desc.device_ptr(), @@ -529,7 +545,7 @@ void writer::impl::init_encoder_pages(hostdevice_vector &ch stream); } } - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); } void writer::impl::encode_pages(hostdevice_vector &chunks, @@ -543,7 +559,7 @@ void writer::impl::encode_pages(hostdevice_vector &chunks, gpu_inflate_status_s *comp_out, const statistics_chunk *page_stats, const statistics_chunk *chunk_stats, - cudaStream_t stream) + rmm::cuda_stream_view stream) { gpu::EncodePages( pages, chunks.device_ptr(), pages_in_batch, first_page_in_batch, comp_in, comp_out, stream); @@ -577,8 +593,8 @@ void writer::impl::encode_pages(hostdevice_vector &chunks, chunks.device_ptr() + first_rowgroup * num_columns, rowgroups_in_batch * num_columns * sizeof(gpu::EncColumnChunk), cudaMemcpyDeviceToHost, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); } writer::impl::impl(std::unique_ptr sink, @@ -598,7 +614,7 @@ std::unique_ptr> writer::impl::write( bool return_filemetadata, const std::string &column_chunks_file_path, bool int96_timestamps, - cudaStream_t stream) + rmm::cuda_stream_view stream) { pq_chunked_state state{metadata, SingleWriteMode::YES, int96_timestamps, stream}; @@ -1091,8 +1107,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) dev_bfr, ck->ck_stat_size, cudaMemcpyDeviceToHost, - state.stream)); - 
CUDA_TRY(cudaStreamSynchronize(state.stream)); + state.stream.value())); + state.stream.synchronize(); } } else { // copy the full data @@ -1100,8 +1116,8 @@ void writer::impl::write_chunk(table_view const &table, pq_chunked_state &state) dev_bfr, ck->ck_stat_size + ck->compressed_size, cudaMemcpyDeviceToHost, - state.stream)); - CUDA_TRY(cudaStreamSynchronize(state.stream)); + state.stream.value())); + state.stream.synchronize(); out_sink_->host_write(host_bfr.get() + ck->ck_stat_size, ck->compressed_size); if (ck->ck_stat_size != 0) { state.md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize( @@ -1174,7 +1190,7 @@ std::unique_ptr> writer::write(table_view const &table, bool return_filemetadata, const std::string column_chunks_file_path, bool int96_timestamps, - cudaStream_t stream) + rmm::cuda_stream_view stream) { return _impl->write( table, metadata, return_filemetadata, column_chunks_file_path, int96_timestamps, stream); diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 51d4213d782..75130c1881d 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -21,6 +21,8 @@ #pragma once +#include "chunked_state.hpp" + #include #include @@ -33,12 +35,12 @@ #include #include +#include + #include #include #include -#include "chunked_state.hpp" - namespace cudf { namespace io { namespace detail { @@ -87,7 +89,7 @@ class writer::impl { bool return_filemetadata, const std::string& column_chunks_file_path, bool int96_timestamps, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Begins the chunked/streamed write process. 
@@ -136,7 +138,7 @@ class writer::impl { uint32_t num_fragments, uint32_t num_rows, uint32_t fragment_size, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Gather per-fragment statistics * @@ -154,7 +156,7 @@ class writer::impl { uint32_t num_columns, uint32_t num_fragments, uint32_t fragment_size, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Build per-chunk dictionaries and count data pages * @@ -170,7 +172,7 @@ class writer::impl { uint32_t num_rowgroups, uint32_t num_columns, uint32_t num_dictionaries, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Initialize encoder pages * @@ -192,7 +194,7 @@ class writer::impl { uint32_t num_columns, uint32_t num_pages, uint32_t num_stats_bfr, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @brief Encode a batch pages * @@ -220,7 +222,7 @@ class writer::impl { gpu_inflate_status_s* comp_out, const statistics_chunk* page_stats, const statistics_chunk* chunk_stats, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: // TODO : figure out if we want to keep this. It is currently unused. diff --git a/cpp/src/io/statistics/column_stats.cu b/cpp/src/io/statistics/column_stats.cu index fb74987f061..69fb714d9c8 100644 --- a/cpp/src/io/statistics/column_stats.cu +++ b/cpp/src/io/statistics/column_stats.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-20, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,11 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include -#include -#include + #include "column_stats.h" +#include + +#include + +#include + +#include + namespace cudf { namespace io { /** @@ -754,9 +760,9 @@ __global__ void __launch_bounds__(block_size, 1) void GatherColumnStatistics(statistics_chunk *chunks, const statistics_group *groups, uint32_t num_chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - gpuGatherColumnStatistics<1024><<>>(chunks, groups); + gpuGatherColumnStatistics<1024><<>>(chunks, groups); } /** @@ -772,9 +778,10 @@ void MergeColumnStatistics(statistics_chunk *chunks_out, const statistics_chunk *chunks_in, const statistics_merge_group *groups, uint32_t num_chunks, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - gpuMergeColumnStatistics<1024><<>>(chunks_out, chunks_in, groups); + gpuMergeColumnStatistics<1024> + <<>>(chunks_out, chunks_in, groups); } } // namespace io diff --git a/cpp/src/io/statistics/column_stats.h b/cpp/src/io/statistics/column_stats.h index 588d764e9af..bbecc85b8d8 100644 --- a/cpp/src/io/statistics/column_stats.h +++ b/cpp/src/io/statistics/column_stats.h @@ -16,6 +16,8 @@ #pragma once #include +#include + namespace cudf { namespace io { @@ -96,7 +98,7 @@ struct statistics_merge_group { void GatherColumnStatistics(statistics_chunk *chunks, const statistics_group *groups, uint32_t num_chunks, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); /** * @brief Launches kernel to merge column statistics @@ -111,7 +113,7 @@ void MergeColumnStatistics(statistics_chunk *chunks_out, const statistics_chunk *chunks_in, const statistics_merge_group *groups, uint32_t num_chunks, - cudaStream_t stream = (cudaStream_t)0); + rmm::cuda_stream_view stream); } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 0290857119b..832817cf7d5 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -51,13 +51,13 @@ 
namespace detail { inline rmm::device_buffer create_data( data_type type, size_type size, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { std::size_t data_size = size_of(type) * size; rmm::device_buffer data(data_size, stream, mr); - CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream)); + CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value())); return data; } @@ -84,7 +84,7 @@ struct column_buffer { column_buffer(data_type _type, size_type _size, bool _is_nullable = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : type(_type), is_nullable(_is_nullable), _null_count(0) { @@ -102,7 +102,7 @@ struct column_buffer { // instantiate a column of known type with a specified size. Allows deferred creation for // preprocessing steps such as in the Parquet reader void create(size_type _size, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size = _size; @@ -164,9 +164,9 @@ namespace { */ std::unique_ptr make_column( column_buffer& buffer, - cudaStream_t stream = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - column_name_info* schema_info = nullptr) + column_name_info* schema_info = nullptr, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { using str_pair = thrust::pair; @@ -194,7 +194,7 @@ std::unique_ptr make_column( // make child column CUDF_EXPECTS(buffer.children.size() > 0, "Encountered malformed column_buffer"); - auto child = make_column(buffer.children[0], stream, mr, child_info); + auto child = make_column(buffer.children[0], child_info, stream, mr); // make the final 
list column (note : size is the # of offsets, so our actual # of rows is 1 // less) @@ -219,7 +219,7 @@ std::unique_ptr make_column( schema_info->children.push_back(column_name_info{""}); child_info = &schema_info->children.back(); } - return make_column(col, stream, mr, child_info); + return make_column(col, child_info, stream, mr); }); return make_structs_column(buffer.size, diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 469683e1ad0..b4c2f491927 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace io { /** @@ -86,7 +88,7 @@ class void_sink : public data_sink { bool supports_device_write() const override { return true; } - void device_write(void const* gpu_data, size_t size, cudaStream_t stream) override + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { bytes_written_ += size; } @@ -109,7 +111,7 @@ class user_sink_wrapper : public data_sink { bool supports_device_write() const override { return user_sink->supports_device_write(); } - void device_write(void const* gpu_data, size_t size, cudaStream_t stream) override + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { CUDF_EXPECTS(user_sink->supports_device_write(), "device_write() being called on a data_sink that doesn't support it"); diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 0bdd5ca8345..5a4ac8e1d7e 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -16,10 +16,11 @@ #pragma once -#include - #include +#include +#include + /** * @brief A helper class that wraps fixed-length device memory for the GPU, and * a mirror host pinned memory for the CPU. 
@@ -43,12 +44,15 @@ class hostdevice_vector { return *this; } - explicit hostdevice_vector(size_t max_size, cudaStream_t stream = 0) + explicit hostdevice_vector(size_t max_size, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) : hostdevice_vector(max_size, max_size, stream) { } - explicit hostdevice_vector(size_t initial_size, size_t max_size, cudaStream_t stream = 0) + explicit hostdevice_vector(size_t initial_size, + size_t max_size, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) : num_elements(initial_size), max_elements(max_size) { if (max_elements != 0) { @@ -87,16 +91,18 @@ class hostdevice_vector { return reinterpret_cast(d_data.data()) + offset; } - void host_to_device(cudaStream_t stream, bool synchronize = false) + void host_to_device(rmm::cuda_stream_view stream, bool synchronize = false) { - cudaMemcpyAsync(d_data.data(), h_data, memory_size(), cudaMemcpyHostToDevice, stream); - if (synchronize) { cudaStreamSynchronize(stream); } + CUDA_TRY(cudaMemcpyAsync( + d_data.data(), h_data, memory_size(), cudaMemcpyHostToDevice, stream.value())); + if (synchronize) { stream.synchronize(); } } - void device_to_host(cudaStream_t stream, bool synchronize = false) + void device_to_host(rmm::cuda_stream_view stream, bool synchronize = false) { - cudaMemcpyAsync(h_data, d_data.data(), memory_size(), cudaMemcpyDeviceToHost, stream); - if (synchronize) { cudaStreamSynchronize(stream); } + CUDA_TRY(cudaMemcpyAsync( + h_data, d_data.data(), memory_size(), cudaMemcpyDeviceToHost, stream.value())); + if (synchronize) { stream.synchronize(); } } private: @@ -113,9 +119,9 @@ class hostdevice_vector { v.h_data = nullptr; } - cudaStream_t stream = 0; - size_t max_elements = 0; - size_t num_elements = 0; - T *h_data = nullptr; - rmm::device_buffer d_data; + rmm::cuda_stream_view stream{}; + size_t max_elements{}; + size_t num_elements{}; + T *h_data{}; + rmm::device_buffer d_data{}; }; diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu 
index 5a2dc32e27a..c3145f71efd 100644 --- a/cpp/src/join/cross_join.cu +++ b/cpp/src/join/cross_join.cu @@ -27,6 +27,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -37,7 +39,7 @@ namespace detail { std::unique_ptr cross_join( cudf::table_view const& left, cudf::table_view const& right, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(0 != left.num_columns(), "Left table is empty"); @@ -75,7 +77,7 @@ std::unique_ptr cross_join(cudf::table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::cross_join(left, right, 0, mr); + return detail::cross_join(left, right, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 91188539790..67b9d3436d8 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -13,11 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include + #include #include #include -#include +#include #include @@ -133,7 +135,7 @@ std::pair, rmm::device_vector> get_left_join_indices_complement(rmm::device_vector &right_indices, size_type left_table_row_count, size_type right_table_row_count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Get array of indices that do not appear in right_indices @@ -146,7 +148,7 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, // right_indices will be JoinNoneValue, i.e. -1. This if path should // produce exactly the same result as the else path but will be faster. 
if (left_table_row_count == 0) { - thrust::sequence(rmm::exec_policy(stream)->on(stream), + thrust::sequence(rmm::exec_policy(stream)->on(stream.value()), right_indices_complement.begin(), right_indices_complement.end(), 0); @@ -158,7 +160,7 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, // invalid_index_map[index_ptr[i]] = 0 for i = 0 to right_table_row_count // Thus specifying that those locations are valid - thrust::scatter_if(rmm::exec_policy(stream)->on(stream), + thrust::scatter_if(rmm::exec_policy(stream)->on(stream.value()), thrust::make_constant_iterator(0), thrust::make_constant_iterator(0) + right_indices.size(), right_indices.begin(), // Index locations @@ -169,7 +171,7 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, size_type end_counter = static_cast(right_table_row_count); // Create list of indices that have been marked as invalid - size_type indices_count = thrust::copy_if(rmm::exec_policy(stream)->on(stream), + size_type indices_count = thrust::copy_if(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(begin_counter), thrust::make_counting_iterator(end_counter), invalid_index_map.begin(), @@ -200,7 +202,7 @@ get_left_join_indices_complement(rmm::device_vector &right_indices, * @return Built hash table. 
*/ std::unique_ptr> build_join_hash_table( - cudf::table_device_view build_table, cudaStream_t stream) + cudf::table_device_view build_table, rmm::cuda_stream_view stream) { CUDF_EXPECTS(0 != build_table.num_columns(), "Selected build dataset is empty"); CUDF_EXPECTS(0 != build_table.num_rows(), "Build side table has no rows"); @@ -209,17 +211,17 @@ std::unique_ptr> build_join_ size_t const hash_table_size = compute_hash_table_size(build_table_num_rows); auto hash_table = multimap_type::create(hash_table_size, + stream, true, multimap_type::hasher(), multimap_type::key_equal(), - multimap_type::allocator_type(), - stream); + multimap_type::allocator_type()); row_hash hash_build{build_table}; rmm::device_scalar failure(0, stream); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; detail::grid_1d config(build_table_num_rows, block_size); - build_hash_table<<>>( + build_hash_table<<>>( *hash_table, hash_build, build_table_num_rows, failure.data()); // Check error code from the kernel if (failure.value(stream) == 1) { CUDF_FAIL("Hash Table insert failure."); } @@ -247,7 +249,7 @@ std::pair, rmm::device_vector> probe_jo cudf::table_device_view probe_table, multimap_type const &hash_table, null_equality compare_nulls, - cudaStream_t stream) + rmm::cuda_stream_view stream) { size_type estimated_size = estimate_join_output_size( build_table, probe_table, hash_table, compare_nulls, stream); @@ -278,17 +280,18 @@ std::pair, rmm::device_vector> probe_jo row_hash hash_probe{probe_table}; row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; probe_hash_table - <<>>(hash_table, - build_table, - probe_table, - hash_probe, - equality, - left_indices.data().get(), - right_indices.data().get(), - write_index.data(), - estimated_size); - - CHECK_CUDA(stream); + <<>>( + hash_table, + build_table, + probe_table, + hash_probe, + equality, + left_indices.data().get(), + right_indices.data().get(), + write_index.data(), + estimated_size); + + 
CHECK_CUDA(stream.value()); join_size = write_index.value(stream); current_estimated_size = estimated_size; @@ -388,8 +391,8 @@ std::pair, std::unique_ptr
> construct_join_output_ VectorPair &joined_indices, std::vector> const &columns_in_common, cudf::hash_join::common_columns_output_side common_columns_output_side, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { std::vector probe_common_col; probe_common_col.reserve(columns_in_common.size()); @@ -481,7 +484,7 @@ hash_join::hash_join_impl::~hash_join_impl() = default; hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, std::vector const &build_on, - cudaStream_t stream) + rmm::cuda_stream_view stream) : _build(build), _build_selected(build.select(build_on)), _build_on(build_on), @@ -505,12 +508,12 @@ hash_join::hash_join_impl::inner_join( std::vector> const &columns_in_common, common_columns_output_side common_columns_output_side, null_equality compare_nulls, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); return compute_hash_join( - probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, mr, stream); + probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } std::unique_ptr hash_join::hash_join_impl::left_join( @@ -518,8 +521,8 @@ std::unique_ptr hash_join::hash_join_impl::left_join( std::vector const &probe_on, std::vector> const &columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); auto probe_build_pair = @@ -528,8 +531,8 @@ std::unique_ptr hash_join::hash_join_impl::left_join( columns_in_common, common_columns_output_side::PROBE, compare_nulls, - mr, - stream); + stream, + mr); return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); } @@ -539,8 +542,8 @@ 
std::unique_ptr hash_join::hash_join_impl::full_join( std::vector const &probe_on, std::vector> const &columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_FUNC_RANGE(); auto probe_build_pair = @@ -549,8 +552,8 @@ std::unique_ptr hash_join::hash_join_impl::full_join( columns_in_common, common_columns_output_side::PROBE, compare_nulls, - mr, - stream); + stream, + mr); return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); } @@ -563,8 +566,8 @@ hash_join::hash_join_impl::compute_hash_join( std::vector> const &columns_in_common, common_columns_output_side common_columns_output_side, null_equality compare_nulls, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) const { CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, @@ -600,7 +603,7 @@ hash_join::hash_join_impl::compute_hash_join( : JoinKind; auto joined_indices = probe_join_indices(probe_selected, compare_nulls, stream); return cudf::detail::construct_join_output_df( - probe, _build, joined_indices, columns_in_common, common_columns_output_side, mr, stream); + probe, _build, joined_indices, columns_in_common, common_columns_output_side, stream, mr); } template @@ -608,7 +611,7 @@ std::enable_if_t, rmm::device_vector>> hash_join::hash_join_impl::probe_join_indices(cudf::table_view const &probe, null_equality compare_nulls, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { // Trivial left join case - exit early if (!_hash_table && JoinKind == cudf::detail::join_kind::LEFT_JOIN) { diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index f49a563e857..36cb486d4c1 100644 --- a/cpp/src/join/hash_join.cuh +++ 
b/cpp/src/join/hash_join.cuh @@ -15,16 +15,19 @@ */ #pragma once +#include +#include + #include #include #include #include #include -#include -#include +#include #include + #include namespace cudf { @@ -58,7 +61,7 @@ size_type estimate_join_output_size(table_device_view build_table, table_device_view probe_table, multimap_type const& hash_table, null_equality compare_nulls, - cudaStream_t stream) + rmm::cuda_stream_view stream) { using estimate_size_type = int64_t; // use 64-bit size so we can detect overflow @@ -95,7 +98,7 @@ size_type estimate_join_output_size(table_device_view build_table, estimate_size_type h_size_estimate{0}; rmm::device_scalar size_estimate(0, stream); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; int numBlocks{-1}; @@ -122,14 +125,14 @@ size_type estimate_join_output_size(table_device_view build_table, // Probe the hash table without actually building the output to simply // find what the size of the output will be. 
compute_join_output_size - <<>>(hash_table, - build_table, - probe_table, - hash_probe, - equality, - sample_probe_num_rows, - size_estimate.data()); - CHECK_CUDA(stream); + <<>>(hash_table, + build_table, + probe_table, + hash_probe, + equality, + sample_probe_num_rows, + size_estimate.data()); + CHECK_CUDA(stream.value()); // Only in case subset of probe table is chosen, // increase the estimated output size by a factor of the ratio between the @@ -177,13 +180,13 @@ size_type estimate_join_output_size(table_device_view build_table, * @return Join output indices vector pair */ inline std::pair, rmm::device_vector> -get_trivial_left_join_indices(table_view const& left, cudaStream_t stream) +get_trivial_left_join_indices(table_view const& left, rmm::cuda_stream_view stream) { rmm::device_vector left_indices(left.num_rows()); thrust::sequence( - rmm::exec_policy(stream)->on(stream), left_indices.begin(), left_indices.end(), 0); + rmm::exec_policy(stream)->on(stream.value()), left_indices.begin(), left_indices.end(), 0); rmm::device_vector right_indices(left.num_rows()); - thrust::fill(rmm::exec_policy(stream)->on(stream), + thrust::fill(rmm::exec_policy(stream)->on(stream.value()), right_indices.begin(), right_indices.end(), JoinNoneValue); @@ -226,7 +229,7 @@ struct hash_join::hash_join_impl { */ hash_join_impl(cudf::table_view const& build, std::vector const& build_on, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); std::pair, std::unique_ptr> inner_join( cudf::table_view const& probe, @@ -234,24 +237,24 @@ struct hash_join::hash_join_impl { std::vector> const& columns_in_common, common_columns_output_side common_columns_output_side, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; std::unique_ptr left_join( cudf::table_view const& probe, std::vector const& probe_on, std::vector> const& 
columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; std::unique_ptr full_join( cudf::table_view const& probe, std::vector const& probe_on, std::vector> const& columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; private: /** @@ -302,8 +305,8 @@ struct hash_join::hash_join_impl { std::vector> const& columns_in_common, common_columns_output_side common_columns_output_side, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) const; + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; /** * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`, @@ -325,7 +328,7 @@ struct hash_join::hash_join_impl { std::pair, rmm::device_vector>> probe_join_indices(cudf::table_view const& probe, null_equality compare_nulls, - cudaStream_t stream) const; + rmm::cuda_stream_view stream) const; }; } // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 7c750395e61..af649fe5fb0 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -13,13 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include +#include + #include #include #include #include -#include -#include +#include namespace cudf { namespace detail { @@ -31,15 +33,16 @@ std::unique_ptr
inner_join( std::vector const& right_on, std::vector> const& columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( {left_input.select(left_on), right_input.select(right_on)}, - rmm::mr::get_current_device_resource(), // temporary objects returned - stream); + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned + // now rebuild the table views with the updated ones auto const left = scatter_columns(matched.second.front(), left_on, left_input); auto const right = scatter_columns(matched.second.back(), right_on, right_input); @@ -58,19 +61,19 @@ std::unique_ptr
inner_join( actual_columns_in_common, cudf::hash_join::common_columns_output_side::BUILD, compare_nulls, - mr, - stream); + stream, + mr); return cudf::detail::combine_table_pair(std::move(probe_build_pair.second), std::move(probe_build_pair.first)); } else { - cudf::hash_join hj_obj(right, right_on); + cudf::hash_join hj_obj(right, right_on, stream); auto probe_build_pair = hj_obj.inner_join(left, left_on, columns_in_common, cudf::hash_join::common_columns_output_side::PROBE, compare_nulls, - mr, - stream); + stream, + mr); return cudf::detail::combine_table_pair(std::move(probe_build_pair.first), std::move(probe_build_pair.second)); } @@ -83,21 +86,21 @@ std::unique_ptr
left_join( std::vector const& right_on, std::vector> const& columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( {left_input.select(left_on), right_input.select(right_on)}, // these should match - rmm::mr::get_current_device_resource(), // temporary objects returned - stream); + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned // now rebuild the table views with the updated ones table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); cudf::hash_join hj_obj(right, right_on, stream); - return hj_obj.left_join(left, left_on, columns_in_common, compare_nulls, mr, stream); + return hj_obj.left_join(left, left_on, columns_in_common, compare_nulls, stream, mr); } std::unique_ptr
full_join( @@ -107,21 +110,21 @@ std::unique_ptr
full_join( std::vector const& right_on, std::vector> const& columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Make sure any dictionary columns have matched key sets. // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( {left_input.select(left_on), right_input.select(right_on)}, // these should match - rmm::mr::get_current_device_resource(), // temporary objects returned - stream); + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned // now rebuild the table views with the updated ones table_view const left = scatter_columns(matched.second.front(), left_on, left_input); table_view const right = scatter_columns(matched.second.back(), right_on, right_input); cudf::hash_join hj_obj(right, right_on, stream); - return hj_obj.full_join(left, left_on, columns_in_common, compare_nulls, mr, stream); + return hj_obj.full_join(left, left_on, columns_in_common, compare_nulls, stream, mr); } } // namespace detail @@ -130,7 +133,7 @@ hash_join::~hash_join() = default; hash_join::hash_join(cudf::table_view const& build, std::vector const& build_on, - cudaStream_t stream) + rmm::cuda_stream_view stream) : impl{std::make_unique(build, build_on, stream)} { } @@ -141,11 +144,11 @@ std::pair, std::unique_ptr> hash_join: std::vector> const& columns_in_common, common_columns_output_side common_columns_output_side, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { return impl->inner_join( - probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, mr, stream); + probe, probe_on, columns_in_common, common_columns_output_side, compare_nulls, stream, mr); } std::unique_ptr 
hash_join::left_join( @@ -153,10 +156,10 @@ std::unique_ptr hash_join::left_join( std::vector const& probe_on, std::vector> const& columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - return impl->left_join(probe, probe_on, columns_in_common, compare_nulls, mr, stream); + return impl->left_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr); } std::unique_ptr hash_join::full_join( @@ -164,10 +167,10 @@ std::unique_ptr hash_join::full_join( std::vector const& probe_on, std::vector> const& columns_in_common, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - return impl->full_join(probe, probe_on, columns_in_common, compare_nulls, mr, stream); + return impl->full_join(probe, probe_on, columns_in_common, compare_nulls, stream, mr); } // external APIs @@ -183,7 +186,7 @@ std::unique_ptr
inner_join( { CUDF_FUNC_RANGE(); return detail::inner_join( - left, right, left_on, right_on, columns_in_common, compare_nulls, mr, 0); + left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr
left_join( @@ -196,7 +199,8 @@ std::unique_ptr
left_join( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_join(left, right, left_on, right_on, columns_in_common, compare_nulls, mr, 0); + return detail::left_join( + left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr
full_join( @@ -209,7 +213,8 @@ std::unique_ptr
full_join( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::full_join(left, right, left_on, right_on, columns_in_common, compare_nulls, mr, 0); + return detail::full_join( + left, right, left_on, right_on, columns_in_common, compare_nulls, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/join/nested_loop_join.cuh b/cpp/src/join/nested_loop_join.cuh index 470549265d1..03d684f91d4 100644 --- a/cpp/src/join/nested_loop_join.cuh +++ b/cpp/src/join/nested_loop_join.cuh @@ -15,18 +15,21 @@ */ #pragma once +#include "hash_join.cuh" +#include "join_common_utils.hpp" +#include "join_kernels.cuh" + #include #include #include #include #include #include -#include +#include -#include "cudf/types.hpp" -#include "hash_join.cuh" -#include "join_common_utils.hpp" -#include "join_kernels.cuh" +#include + +#include namespace cudf { namespace detail { @@ -48,7 +51,7 @@ size_type estimate_nested_loop_join_output_size(table_device_view left, table_device_view right, join_kind JoinKind, null_equality compare_nulls, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const size_type left_num_rows{left.num_rows()}; const size_type right_num_rows{right.num_rows()}; @@ -72,7 +75,7 @@ size_type estimate_nested_loop_join_output_size(table_device_view left, size_type h_size_estimate{0}; rmm::device_scalar size_estimate(0, stream); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; int numBlocks{-1}; @@ -91,9 +94,10 @@ size_type estimate_nested_loop_join_output_size(table_device_view left, row_equality equality{left, right, compare_nulls == null_equality::EQUAL}; // Determine number of output rows without actually building the output to simply // find what the size of the output will be. 
- compute_nested_loop_join_output_size<<>>( - left, right, JoinKind, equality, size_estimate.data()); - CHECK_CUDA(stream); + compute_nested_loop_join_output_size + <<>>( + left, right, JoinKind, equality, size_estimate.data()); + CHECK_CUDA(stream.value()); h_size_estimate = size_estimate.value(stream); @@ -120,7 +124,7 @@ get_base_nested_loop_join_indices(table_view const& left, bool flip_join_indices, join_kind JoinKind, null_equality compare_nulls, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // The `right` table is always used for the inner loop. We want to use the smaller table // for the inner loop. Thus, if `left` is smaller than `right`, swap `left/right`. @@ -167,16 +171,16 @@ get_base_nested_loop_join_indices(table_view const& left, const auto& join_output_r = flip_join_indices ? left_indices.data().get() : right_indices.data().get(); nested_loop_join - <<>>(*left_table, - *right_table, - JoinKind, - equality, - join_output_l, - join_output_r, - write_index.data(), - estimated_size); - - CHECK_CUDA(stream); + <<>>(*left_table, + *right_table, + JoinKind, + equality, + join_output_l, + join_output_r, + write_index.data(), + estimated_size); + + CHECK_CUDA(stream.value()); join_size = write_index.value(); current_estimated_size = estimated_size; diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 2b58c1a864a..6df329243ed 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -75,8 +75,8 @@ std::unique_ptr left_semi_anti_join( std::vector const& right_on, std::vector const& return_columns, null_equality compare_nulls, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(0 != left.num_columns(), "Left table is empty"); CUDF_EXPECTS(0 != right.num_columns(), "Right table is empty"); @@ -100,8 +100,9 @@ std::unique_ptr 
left_semi_anti_join( // This will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( {left.select(left_on), right.select(right_on)}, - rmm::mr::get_current_device_resource(), // temporary objects returned - stream); + stream, + rmm::mr::get_current_device_resource()); // temporary objects returned + auto const left_selected = matched.second.front(); auto const right_selected = matched.second.back(); @@ -120,13 +121,14 @@ std::unique_ptr left_semi_anti_join( row_equality equality_probe{*left_rows_d, *right_rows_d, compare_nulls == null_equality::EQUAL}; auto hash_table_ptr = hash_table_type::create(hash_table_size, + stream, std::numeric_limits::max(), std::numeric_limits::max(), hash_build, equality_build); auto hash_table = *hash_table_ptr; - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), right_num_rows, [hash_table] __device__(size_type idx) mutable { @@ -145,7 +147,7 @@ std::unique_ptr left_semi_anti_join( // gather_map_end will be the end of valid data in gather_map auto gather_map_end = thrust::copy_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(left_num_rows), gather_map.begin(), @@ -171,7 +173,7 @@ std::unique_ptr left_semi_join(cudf::table_view const& left, { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - left, right, left_on, right_on, return_columns, compare_nulls, mr, 0); + left, right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr left_anti_join(cudf::table_view const& left, @@ -184,7 +186,7 @@ std::unique_ptr left_anti_join(cudf::table_view const& left, { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - left, right, left_on, right_on, return_columns, compare_nulls, mr, 0); + left, 
right, left_on, right_on, return_columns, compare_nulls, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu index ccf57a09d52..c65a0518431 100644 --- a/cpp/src/lists/copying/copying.cu +++ b/cpp/src/lists/copying/copying.cu @@ -31,7 +31,7 @@ namespace detail { std::unique_ptr copy_slice(lists_column_view const& lists, size_type start, size_type end, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (lists.is_empty()) { return cudf::empty_like(lists.parent()); } @@ -54,7 +54,7 @@ std::unique_ptr copy_slice(lists_column_view const& lists, // Compute the offsets column of the result: thrust::transform( - execpol->on(stream), + execpol->on(stream.value()), offsets_data + start, offsets_data + end + 1, // size of offsets column is 1 greater than slice length out_offsets.data(), @@ -73,8 +73,7 @@ std::unique_ptr copy_slice(lists_column_view const& lists, cudf::detail::slice(lists.child(), {start_offset, end_offset}, stream).front()); // Compute the null mask of the result: - auto null_mask = - cudf::detail::copy_bitmask(lists.null_mask(), start, end, rmm::cuda_stream_view{stream}, mr); + auto null_mask = cudf::detail::copy_bitmask(lists.null_mask(), start, end, stream, mr); return make_lists_column(lists_count, std::move(offsets), diff --git a/cpp/src/lists/copying/gather.cu b/cpp/src/lists/copying/gather.cu index 96c20fd93ad..a6d9e0baf40 100644 --- a/cpp/src/lists/copying/gather.cu +++ b/cpp/src/lists/copying/gather.cu @@ -13,13 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include -#include + #include #include #include +#include + namespace cudf { namespace lists { namespace detail { diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index 5adb21a47f1..342bd006ea2 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -18,6 +18,8 @@ #include #include +#include + #include namespace cudf { @@ -57,7 +59,7 @@ struct map_index_fn { */ std::unique_ptr extract_list_element(lists_column_view lists_column, size_type index, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (lists_column.is_empty()) return empty_like(lists_column.parent()); @@ -80,13 +82,13 @@ std::unique_ptr extract_list_element(lists_column_view lists_column, // build the gather map using the offsets and the provided index auto const d_column = column_device_view::create(annotated_offsets, stream); if (index < 0) - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(gather_map->size()), d_gather_map, map_index_fn{*d_column, index, child_column.size()}); else - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(gather_map->size()), d_gather_map, @@ -114,7 +116,7 @@ std::unique_ptr extract_list_element(lists_column_view const& lists_colu size_type index, rmm::mr::device_memory_resource* mr) { - return detail::extract_list_element(lists_column, index, 0, mr); + return detail::extract_list_element(lists_column, index, rmm::cuda_stream_default, mr); } } // namespace lists diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 265a20bcbb7..0afdac17a7d 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ #include +#include #include #include #include @@ -24,6 +25,7 @@ #include #include +#include #include #include @@ -103,7 +105,7 @@ void materialize_bitmask(column_view const& left_col, column_view const& right_col, mutable_column_view& out_col, index_type const* merged_indices, - cudaStream_t stream) + rmm::cuda_stream_view stream) { constexpr size_type BLOCK_SIZE{256}; detail::grid_1d grid_config{out_col.size(), BLOCK_SIZE}; @@ -119,24 +121,24 @@ void materialize_bitmask(column_view const& left_col, if (left_col.has_nulls()) { if (right_col.has_nulls()) { materialize_merged_bitmask_kernel - <<>>( + <<>>( left_valid, right_valid, out_valid, out_col.size(), merged_indices); } else { materialize_merged_bitmask_kernel - <<>>( + <<>>( left_valid, right_valid, out_valid, out_col.size(), merged_indices); } } else { if (right_col.has_nulls()) { materialize_merged_bitmask_kernel - <<>>( + <<>>( left_valid, right_valid, out_valid, out_col.size(), merged_indices); } else { CUDF_FAIL("materialize_merged_bitmask_kernel() should never be called."); } } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); } /** @@ -161,8 +163,8 @@ rmm::device_vector generate_merged_indices( table_view const& right_table, std::vector const& column_order, std::vector const& null_precedence, - bool nullable = true, - cudaStream_t stream = nullptr) + bool nullable = true, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { const size_type left_size = left_table.num_rows(); const size_type right_size = right_table.num_rows(); @@ -200,7 +202,7 @@ rmm::device_vector generate_merged_indices( *rhs_device_view, d_column_order.data().get(), d_null_precedence.data().get()); - thrust::merge(exec_pol->on(stream), + thrust::merge(exec_pol->on(stream.value()), left_begin_zip_iterator, left_end_zip_iterator, right_begin_zip_iterator, @@ -210,7 +212,7 @@ rmm::device_vector generate_merged_indices( } else { auto ineq_op = detail::row_lexicographic_tagged_comparator( *lhs_device_view, *rhs_device_view, 
d_column_order.data().get()); - thrust::merge(exec_pol->on(stream), + thrust::merge(exec_pol->on(stream.value()), left_begin_zip_iterator, left_end_zip_iterator, right_begin_zip_iterator, @@ -219,7 +221,7 @@ rmm::device_vector generate_merged_indices( ineq_op); } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return merged_indices; } @@ -231,24 +233,24 @@ rmm::device_vector generate_merged_indices( * (ordered according to indices of key_cols) and the 2 columns to merge. */ struct column_merger { - explicit column_merger( - index_vector const& row_order, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = nullptr) - : row_order_(row_order), mr_(mr), stream_(stream) - { - } + explicit column_merger(index_vector const& row_order) : row_order_(row_order) {} // column merger operator; // template // required: column type - std::unique_ptr operator()(column_view const& lcol, column_view const& rcol) const + std::unique_ptr operator()( + column_view const& lcol, + column_view const& rcol, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const { auto lsz = lcol.size(); auto merged_size = lsz + rcol.size(); - auto type = lcol.type(); - auto merged_col = lcol.has_nulls() ? cudf::allocate_like(lcol, merged_size) - : cudf::allocate_like(rcol, merged_size); + auto merged_col = cudf::detail::allocate_like(lcol.has_nulls() ? 
lcol : rcol, + merged_size, + cudf::mask_allocation_policy::RETAIN, + stream, + mr); //"gather" data from lcol, rcol according to row_order_ "map" //(directly calling gather() won't work because @@ -258,12 +260,13 @@ struct column_merger { // initialize null_mask to all valid: // - // Note: this initialization in conjunction with _conditionally_ - // calling materialize_bitmask() below covers the case - // materialize_merged_bitmask_kernel() - // which won't be called anymore (because of the _condition_ below) + // Note: this initialization in conjunction with + // _conditionally_ calling materialize_bitmask() below covers + // the case materialize_merged_bitmask_kernel() + // which won't be called anymore (because of the _condition_ + // below) // - cudf::detail::set_null_mask(merged_view.null_mask(), 0, merged_view.size(), true, stream_); + cudf::detail::set_null_mask(merged_view.null_mask(), 0, merged_view.size(), true, stream); // set the null count: // @@ -276,13 +279,13 @@ struct column_merger { auto const d_lcol = lcol.data(); auto const d_rcol = rcol.data(); - auto exe_pol = rmm::exec_policy(stream_); + auto exe_pol = rmm::exec_policy(stream); // capture lcol, rcol // and "gather" into merged_view.data()[indx_merged] // from lcol or rcol, depending on side; // - thrust::transform(exe_pol->on(stream_), + thrust::transform(exe_pol->on(stream.value()), row_order_.begin(), row_order_.end(), merged_view.begin(), @@ -299,7 +302,7 @@ struct column_merger { if (lcol.has_nulls() || rcol.has_nulls()) { // resolve null mask: // - materialize_bitmask(lcol, rcol, merged_view, row_order_.data().get(), stream_); + materialize_bitmask(lcol, rcol, merged_view, row_order_.data().get(), stream); } return merged_col; @@ -307,42 +310,43 @@ struct column_merger { private: index_vector const& row_order_; - rmm::mr::device_memory_resource* mr_; - cudaStream_t stream_; }; // specialization for strings template <> -std::unique_ptr column_merger::operator()(column_view const& lcol, - 
column_view const& rcol) const +std::unique_ptr column_merger::operator()( + column_view const& lcol, + column_view const& rcol, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { auto column = strings::detail::merge(strings_column_view(lcol), strings_column_view(rcol), row_order_.begin(), row_order_.end(), - mr_, - stream_); + stream, + mr); if (lcol.has_nulls() || rcol.has_nulls()) { auto merged_view = column->mutable_view(); - materialize_bitmask(lcol, rcol, merged_view, row_order_.data().get(), stream_); + materialize_bitmask(lcol, rcol, merged_view, row_order_.data().get(), stream); } return column; } // specialization for dictionary template <> -std::unique_ptr column_merger::operator()(column_view const& lcol, - column_view const& rcol) const +std::unique_ptr column_merger::operator()( + column_view const& lcol, + column_view const& rcol, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - auto result = cudf::dictionary::detail::merge(cudf::dictionary_column_view(lcol), - cudf::dictionary_column_view(rcol), - row_order_, - mr_, - stream_); + auto result = cudf::dictionary::detail::merge( + cudf::dictionary_column_view(lcol), cudf::dictionary_column_view(rcol), row_order_, stream, mr); // set the validity mask if (lcol.has_nulls() || rcol.has_nulls()) { auto merged_view = result->mutable_view(); - materialize_bitmask(lcol, rcol, merged_view, row_order_.data().get(), stream_); + materialize_bitmask(lcol, rcol, merged_view, row_order_.data().get(), stream); } return result; } @@ -355,8 +359,8 @@ table_ptr_type merge(cudf::table_view const& left_table, std::vector const& key_cols, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // collect index columns for lhs, rhs, resp. 
// @@ -375,13 +379,14 @@ table_ptr_type merge(cudf::table_view const& left_table, std::vector> merged_cols; merged_cols.reserve(n_cols); - column_merger merger{merged_indices, mr, stream}; + column_merger merger{merged_indices}; transform(left_table.begin(), left_table.end(), right_table.begin(), std::back_inserter(merged_cols), [&](auto const& left_col, auto const& right_col) { - return cudf::type_dispatcher(left_col.type(), merger, left_col, right_col); + return cudf::type_dispatcher( + left_col.type(), merger, left_col, right_col, stream, mr); }); return std::make_unique(std::move(merged_cols)); @@ -417,8 +422,8 @@ table_ptr_type merge(std::vector const& tables_to_merge, std::vector const& key_cols, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (tables_to_merge.empty()) { return std::make_unique(); } @@ -444,7 +449,7 @@ table_ptr_type merge(std::vector const& tables_to_merge, // This utility will ensure all corresponding dictionary columns have matching keys. // It will return any new dictionary columns created as well as updated table_views. auto matched = cudf::dictionary::detail::match_dictionaries( - tables_to_merge, rmm::mr::get_current_device_resource(), stream); + tables_to_merge, stream, rmm::mr::get_current_device_resource()); auto merge_tables = matched.second; // A queue of (table view, table) pairs @@ -468,14 +473,14 @@ table_ptr_type merge(std::vector const& tables_to_merge, auto const right_table = top_and_pop(merge_queue); // Only use mr for the output table - auto const& new_tbl_rm = merge_queue.empty() ? mr : rmm::mr::get_current_device_resource(); + auto const& new_tbl_mr = merge_queue.empty() ? 
mr : rmm::mr::get_current_device_resource(); auto merged_table = merge(left_table.view, right_table.view, key_cols, column_order, null_precedence, - new_tbl_rm, - stream); + stream, + new_tbl_mr); auto const merged_table_view = merged_table->view(); merge_queue.emplace(merged_table_view, std::move(merged_table)); @@ -493,7 +498,8 @@ std::unique_ptr merge(std::vector const& tables_to_merg rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::merge(tables_to_merge, key_cols, column_order, null_precedence, mr); + return detail::merge( + tables_to_merge, key_cols, column_order, null_precedence, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 3d0f35568f4..6f79cf3aa08 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -26,6 +26,8 @@ #include #include +#include + namespace cudf { namespace { // Launch configuration for optimized hash partition @@ -338,7 +340,7 @@ void copy_block_partitions_impl(InputIter const input, size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // We need 3 chunks of shared memory: // 1. 
BLOCK_SIZE * ROWS_PER_THREAD elements of size_type for copying to output @@ -347,7 +349,7 @@ void copy_block_partitions_impl(InputIter const input, int const smem = OPTIMIZED_BLOCK_SIZE * OPTIMIZED_ROWS_PER_THREAD * sizeof(*output) + (num_partitions + 1) * sizeof(size_type) * 2; - copy_block_partitions<<>>( + copy_block_partitions<<>>( input, output, num_rows, @@ -365,7 +367,7 @@ rmm::device_vector compute_gather_map(size_type num_rows, size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto sequence = thrust::make_counting_iterator(0); rmm::device_vector gather_map(num_rows); @@ -393,8 +395,8 @@ struct copy_block_partitions_dispatcher { size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_buffer output(input.size() * sizeof(DataType), stream, mr); @@ -420,8 +422,8 @@ struct copy_block_partitions_dispatcher { size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Use move_to_output_buffer to create an equivalent gather map auto gather_map = compute_gather_map(input.size(), @@ -451,8 +453,8 @@ std::pair, std::vector> hash_partition_table( table_view const& input, table_view const& table_to_hash, size_type num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const num_rows = table_to_hash.num_rows(); @@ -500,14 +502,14 @@ std::pair, std::vector> hash_partition_table( compute_row_partition_numbers<<>>(hasher, - num_rows, - num_partitions, - 
partitioner_type(num_partitions), - row_partition_numbers.data().get(), - row_partition_offset.data().get(), - block_partition_sizes.data().get(), - global_partition_sizes.data().get()); + stream.value()>>>(hasher, + num_rows, + num_partitions, + partitioner_type(num_partitions), + row_partition_numbers.data().get(), + row_partition_offset.data().get(), + block_partition_sizes.data().get(), + global_partition_sizes.data().get()); } else { // Determines how the mapping between hash value and partition number is // computed @@ -520,19 +522,19 @@ std::pair, std::vector> hash_partition_table( compute_row_partition_numbers<<>>(hasher, - num_rows, - num_partitions, - partitioner_type(num_partitions), - row_partition_numbers.data().get(), - row_partition_offset.data().get(), - block_partition_sizes.data().get(), - global_partition_sizes.data().get()); + stream.value()>>>(hasher, + num_rows, + num_partitions, + partitioner_type(num_partitions), + row_partition_numbers.data().get(), + row_partition_offset.data().get(), + block_partition_sizes.data().get(), + global_partition_sizes.data().get()); } // Compute exclusive scan of all blocks' partition sizes in-place to determine // the starting point for each blocks portion of each partition in the output - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), block_partition_sizes.begin(), block_partition_sizes.end(), scanned_block_partition_sizes.data().get()); @@ -541,7 +543,7 @@ std::pair, std::vector> hash_partition_table( // location of each partition in final output. 
// TODO This can be done independently on a separate stream size_type* scanned_global_partition_sizes{global_partition_sizes.data().get()}; - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), global_partition_sizes.begin(), global_partition_sizes.end(), scanned_global_partition_sizes); @@ -553,7 +555,7 @@ std::pair, std::vector> hash_partition_table( scanned_global_partition_sizes, num_partitions * sizeof(size_type), cudaMemcpyDeviceToHost, - stream)); + stream.value())); // When the number of partitions is less than a threshold, we can apply an // optimization using shared memory to copy values to the output buffer. @@ -579,8 +581,8 @@ std::pair, std::vector> hash_partition_table( block_partition_sizes_ptr, scanned_block_partition_sizes_ptr, grid_size, - mr, - stream); + stream, + mr); }); if (has_nulls(input)) { @@ -609,7 +611,7 @@ std::pair, std::vector> hash_partition_table( compute_row_output_locations<<>>( + stream.value()>>>( row_output_locations, num_rows, num_partitions, scanned_block_partition_sizes_ptr); // Use the resulting scatter map to materialize the output @@ -646,8 +648,8 @@ struct dispatch_map_type { operator()(table_view const& t, column_view const& partition_map, size_type num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { // Build a histogram of the number of rows in each partition rmm::device_vector histogram(num_partitions + 1); @@ -663,7 +665,7 @@ struct dispatch_map_type { lower_level, upper_level, partition_map.size(), - stream); + stream.value()); rmm::device_buffer temp_storage(temp_storage_bytes, stream); @@ -675,12 +677,14 @@ struct dispatch_map_type { lower_level, upper_level, partition_map.size(), - stream); + stream.value()); // `histogram` was created with an extra entry at the end such that an // exclusive scan will put the total number of 
rows at the end - thrust::exclusive_scan( - rmm::exec_policy()->on(stream), histogram.begin(), histogram.end(), histogram.begin()); + thrust::exclusive_scan(rmm::exec_policy()->on(stream.value()), + histogram.begin(), + histogram.end(), + histogram.begin()); // Copy offsets to host std::vector partition_offsets(histogram.size()); @@ -692,7 +696,7 @@ struct dispatch_map_type { // For each `partition_map[i]`, atomically increment the corresponding // partition offset to determine `i`s location in the output - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), partition_map.begin(), partition_map.end(), scatter_map.begin(), @@ -713,8 +717,8 @@ struct dispatch_map_type { operator()(table_view const& t, column_view const& partition_map, size_type num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("Unexpected, non-integral partition map."); } @@ -723,12 +727,13 @@ struct dispatch_map_type { namespace detail { namespace local { + std::pair, std::vector> hash_partition( table_view const& input, std::vector const& columns_to_hash, int num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto table_to_hash = input.select(columns_to_hash); @@ -738,9 +743,9 @@ std::pair, std::vector> hash_partition( } if (has_nulls(table_to_hash)) { - return hash_partition_table(input, table_to_hash, num_partitions, mr, stream); + return hash_partition_table(input, table_to_hash, num_partitions, stream, mr); } else { - return hash_partition_table(input, table_to_hash, num_partitions, mr, stream); + return hash_partition_table(input, table_to_hash, num_partitions, stream, mr); } } } // namespace local @@ -749,8 +754,8 @@ std::pair, std::vector> partition( table_view const& t, column_view const& 
partition_map, size_type num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(t.num_rows() == partition_map.size(), "Size mismatch between table and partition map."); @@ -761,7 +766,7 @@ std::pair, std::vector> partition( } return cudf::type_dispatcher( - partition_map.type(), dispatch_map_type{}, t, partition_map, num_partitions, mr, stream); + partition_map.type(), dispatch_map_type{}, t, partition_map, num_partitions, stream, mr); } } // namespace detail @@ -773,7 +778,8 @@ std::pair, std::vector> hash_partition( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::local::hash_partition(input, columns_to_hash, num_partitions, mr); + return detail::local::hash_partition( + input, columns_to_hash, num_partitions, rmm::cuda_stream_default, mr); } // Partition based on an explicit partition map @@ -784,7 +790,7 @@ std::pair, std::vector> partition( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::partition(t, partition_map, num_partitions, mr); + return detail::partition(t, partition_map, num_partitions, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index aadcaa6d51f..6367293a9d3 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include @@ -27,6 +26,9 @@ #include #include +#include +#include + #include #include #include @@ -77,8 +79,8 @@ std::pair, std::vector> degenerate cudf::table_view const& input, cudf::size_type num_partitions, cudf::size_type start_partition, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto nrows = input.num_rows(); @@ -93,7 +95,7 @@ std::pair, std::vector> degenerate if (num_partitions == nrows) { VectorT partition_offsets(num_partitions, cudf::size_type{0}); auto exec = rmm::exec_policy(stream); - thrust::sequence(exec->on(stream), partition_offsets.begin(), partition_offsets.end()); + thrust::sequence(exec->on(stream.value()), partition_offsets.begin(), partition_offsets.end()); auto uniq_tbl = cudf::detail::gather(input, rotated_iter_begin, @@ -109,9 +111,9 @@ std::pair, std::vector> degenerate partition_offsets.data().get(), sizeof(cudf::size_type) * num_partitions, cudaMemcpyDeviceToHost, - stream)); + stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); return ret_pair; } else { //( num_partitions > nrows ) @@ -122,7 +124,7 @@ std::pair, std::vector> degenerate //(this relies on a _stable_ copy_if()) // auto exec = rmm::exec_policy(stream); - thrust::copy_if(exec->on(stream), + thrust::copy_if(exec->on(stream.value()), rotated_iter_begin, rotated_iter_begin + num_partitions, d_row_indices.begin(), @@ -151,7 +153,7 @@ std::pair, std::vector> degenerate // offsets (part 2: compute partition offsets): // VectorT partition_offsets(num_partitions, cudf::size_type{0}); - thrust::exclusive_scan(exec->on(stream), + thrust::exclusive_scan(exec->on(stream.value()), nedges_iter_begin, nedges_iter_begin + num_partitions, partition_offsets.begin()); @@ -160,9 +162,9 @@ std::pair, std::vector> degenerate partition_offsets.data().get(), sizeof(cudf::size_type) * num_partitions, cudaMemcpyDeviceToHost, - stream)); + 
stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); return ret_pair; } @@ -175,8 +177,8 @@ std::pair, std::vector> round_robin_part table_view const& input, cudf::size_type num_partitions, cudf::size_type start_partition = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto nrows = input.num_rows(); @@ -191,7 +193,7 @@ std::pair, std::vector> round_robin_part // handle degenerate case: // if (num_partitions >= nrows) { - return degenerate_partitions(input, num_partitions, start_partition, mr, stream); + return degenerate_partitions(input, num_partitions, start_partition, stream, mr); } auto np_max_size = nrows % num_partitions; //# partitions of max size @@ -288,7 +290,8 @@ std::pair, std::vector> round_robi rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_FUNC_RANGE(); - return cudf::detail::round_robin_partition(input, num_partitions, start_partition, mr); + return cudf::detail::round_robin_partition( + input, num_partitions, start_partition, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index bfb592595c3..ab0e45f648a 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -24,8 +24,11 @@ #include #include +#include + #include #include + #include namespace cudf { @@ -183,7 +186,7 @@ struct minmax_functor { std::enable_if_t() and !std::is_same::value and !cudf::is_dictionary()> * = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &col, rmm::mr::device_memory_resource *mr, rmm::cuda_stream_view stream) + cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { // compute minimum and maximum values auto dev_result = reduce(col, 
stream); @@ -202,7 +205,7 @@ struct minmax_functor { */ template ::value> * = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &col, rmm::mr::device_memory_resource *mr, rmm::cuda_stream_view stream) + cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { // compute minimum and maximum values auto dev_result = reduce(col, stream); @@ -212,8 +215,8 @@ struct minmax_functor { CUDA_TRY(cudaMemcpyAsync( &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDeviceToHost, stream.value())); // strings are copied to create the scalars here - return {std::make_unique(host_result.min_val, true, stream.value(), mr), - std::make_unique(host_result.max_val, true, stream.value(), mr)}; + return {std::make_unique(host_result.min_val, true, stream, mr), + std::make_unique(host_result.max_val, true, stream, mr)}; } /** @@ -221,7 +224,7 @@ struct minmax_functor { */ template ()> * = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &col, rmm::mr::device_memory_resource *mr, rmm::cuda_stream_view stream) + cudf::column_view const &col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { // compute minimum and maximum values auto dev_result = reduce(col, stream); @@ -232,13 +235,13 @@ struct minmax_functor { &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDeviceToHost, stream.value())); // get the keys for those indexes auto const keys = dictionary_column_view(col).keys(); - return {get_element(keys, static_cast(host_result.min_val), stream.value(), mr), - get_element(keys, static_cast(host_result.max_val), stream.value(), mr)}; + return {get_element(keys, static_cast(host_result.min_val), stream, mr), + get_element(keys, static_cast(host_result.max_val), stream, mr)}; } template ()> * = nullptr> std::pair, std::unique_ptr> operator()( - cudf::column_view const &, rmm::mr::device_memory_resource *, rmm::cuda_stream_view) + cudf::column_view const &, 
rmm::cuda_stream_view, rmm::mr::device_memory_resource *) { CUDF_FAIL("type not supported for minmax() operation"); } @@ -256,7 +259,7 @@ std::pair, std::unique_ptr> minmax( make_default_constructed_scalar(col.type())}; } - return type_dispatcher(col.type(), minmax_functor{}, col, mr, stream); + return type_dispatcher(col.type(), minmax_functor{}, col, stream, mr); } } // namespace detail @@ -264,9 +267,9 @@ std::pair, std::unique_ptr> minmax( * @copydoc cudf::minmax */ std::pair, std::unique_ptr> minmax( - const cudf::column_view &col, rmm::mr::device_memory_resource *mr) + const column_view &col, rmm::mr::device_memory_resource *mr) { - return cudf::detail::minmax(col, rmm::cuda_stream_default, mr); + return detail::minmax(col, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/reductions/scan.cu b/cpp/src/reductions/scan.cu index 6d90124db36..2c0b8b8d71d 100644 --- a/cpp/src/reductions/scan.cu +++ b/cpp/src/reductions/scan.cu @@ -57,23 +57,22 @@ struct ScanDispatcher { template ::value, T>* = nullptr> auto exclusive_scan(const column_view& input_view, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const size_type size = input_view.size(); auto output_column = detail::allocate_like(input_view, size, mask_allocation_policy::NEVER, stream, mr); if (null_handling == null_policy::EXCLUDE) { - output_column->set_null_mask( - detail::copy_bitmask(input_view, rmm::cuda_stream_view{stream}, mr), - input_view.null_count()); + output_column->set_null_mask(detail::copy_bitmask(input_view, stream, mr), + input_view.null_count()); } mutable_column_view output = output_column->mutable_view(); auto d_input = column_device_view::create(input_view, stream); if (input_view.has_nulls()) { auto input = make_null_replacement_iterator(*d_input, Op::template identity()); - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + 
thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), input, input + size, output.data(), @@ -81,7 +80,7 @@ struct ScanDispatcher { Op{}); } else { auto input = d_input->begin(); - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), input, input + size, output.data(), @@ -89,7 +88,7 @@ struct ScanDispatcher { Op{}); } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return output_column; } @@ -97,24 +96,25 @@ struct ScanDispatcher { template (), T>* = nullptr> std::unique_ptr exclusive_scan(const column_view& input_view, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("String types supports only inclusive min/max for `cudf::scan`"); } rmm::device_buffer mask_inclusive_scan(const column_view& input_view, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_buffer mask = detail::create_null_mask(input_view.size(), mask_state::UNINITIALIZED, stream, mr); - auto d_input = column_device_view::create(input_view, stream); - auto v = detail::make_validity_iterator(*d_input); - auto first_null_position = - thrust::find_if_not( - rmm::exec_policy(stream)->on(stream), v, v + input_view.size(), thrust::identity{}) - - v; + auto d_input = column_device_view::create(input_view, stream); + auto v = detail::make_validity_iterator(*d_input); + auto first_null_position = thrust::find_if_not(rmm::exec_policy(stream)->on(stream.value()), + v, + v + input_view.size(), + thrust::identity{}) - + v; cudf::set_null_mask( static_cast(mask.data()), 0, first_null_position, true); cudf::set_null_mask( @@ -126,19 +126,18 @@ struct ScanDispatcher { template ::value, T>* = nullptr> auto inclusive_scan(const column_view& input_view, null_policy null_handling, - 
rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const size_type size = input_view.size(); auto output_column = detail::allocate_like(input_view, size, mask_allocation_policy::NEVER, stream, mr); if (null_handling == null_policy::EXCLUDE) { - output_column->set_null_mask( - detail::copy_bitmask(input_view, rmm::cuda_stream_view{stream}, mr), - input_view.null_count()); + output_column->set_null_mask(detail::copy_bitmask(input_view, stream, mr), + input_view.null_count()); } else { if (input_view.nullable()) { - output_column->set_null_mask(mask_inclusive_scan(input_view, mr, stream), + output_column->set_null_mask(mask_inclusive_scan(input_view, stream, mr), cudf::UNKNOWN_NULL_COUNT); } } @@ -149,14 +148,14 @@ struct ScanDispatcher { if (input_view.has_nulls()) { auto input = make_null_replacement_iterator(*d_input, Op::template identity()); thrust::inclusive_scan( - rmm::exec_policy(stream)->on(stream), input, input + size, output.data(), Op{}); + rmm::exec_policy(stream)->on(stream.value()), input, input + size, output.data(), Op{}); } else { auto input = d_input->begin(); thrust::inclusive_scan( - rmm::exec_policy(stream)->on(stream), input, input + size, output.data(), Op{}); + rmm::exec_policy(stream)->on(stream.value()), input, input + size, output.data(), Op{}); } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return output_column; } @@ -164,8 +163,8 @@ struct ScanDispatcher { template (), T>* = nullptr> std::unique_ptr inclusive_scan(const column_view& input_view, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { const size_type size = input_view.size(); rmm::device_vector result(size); @@ -174,23 +173,28 @@ struct ScanDispatcher { if (input_view.has_nulls()) { auto input = make_null_replacement_iterator(*d_input, Op::template identity()); - 
thrust::inclusive_scan( - rmm::exec_policy(stream)->on(stream), input, input + size, result.data().get(), Op{}); + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), + input, + input + size, + result.data().get(), + Op{}); } else { auto input = d_input->begin(); - thrust::inclusive_scan( - rmm::exec_policy(stream)->on(stream), input, input + size, result.data().get(), Op{}); + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), + input, + input + size, + result.data().get(), + Op{}); } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); auto output_column = make_strings_column(result, Op::template identity(), stream, mr); if (null_handling == null_policy::EXCLUDE) { - output_column->set_null_mask( - detail::copy_bitmask(input_view, rmm::cuda_stream_view{stream}, mr), - input_view.null_count()); + output_column->set_null_mask(detail::copy_bitmask(input_view, stream, mr), + input_view.null_count()); } else { if (input_view.nullable()) { - output_column->set_null_mask(mask_inclusive_scan(input_view, mr, stream), + output_column->set_null_mask(mask_inclusive_scan(input_view, stream, mr), cudf::UNKNOWN_NULL_COUNT); } } @@ -203,8 +207,8 @@ struct ScanDispatcher { * * @param input input column view * @param inclusive inclusive or exclusive scan - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory * @return * * @tparam T type of input column @@ -213,14 +217,14 @@ struct ScanDispatcher { std::unique_ptr operator()(const column_view& input, scan_type inclusive, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::unique_ptr output; if (inclusive == scan_type::INCLUSIVE) - output = inclusive_scan(input, null_handling, mr, stream); + output = inclusive_scan(input, null_handling, stream, mr); else - output = exclusive_scan(input, null_handling, mr, stream); + output = exclusive_scan(input, null_handling, stream, mr); if (null_handling == null_policy::EXCLUDE) { CUDF_EXPECTS(input.null_count() == output->null_count(), "Input / output column null count mismatch"); @@ -232,8 +236,8 @@ struct ScanDispatcher { std::unique_ptr operator()(const column_view& input, scan_type inclusive, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Non-arithmetic types not supported for `cudf::scan`"); } @@ -244,8 +248,8 @@ std::unique_ptr scan( std::unique_ptr const& agg, scan_type inclusive, null_policy null_handling, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_numeric(input.type()) || is_compound(input.type()), "Unexpected non-numeric or non-string type."); @@ -257,32 +261,32 @@ std::unique_ptr scan( input, inclusive, null_handling, - mr, - stream); + stream, + mr); case aggregation::MIN: return cudf::type_dispatcher(input.type(), ScanDispatcher(), input, inclusive, null_handling, - mr, - stream); + stream, + mr); case aggregation::MAX: return 
cudf::type_dispatcher(input.type(), ScanDispatcher(), input, inclusive, null_handling, - mr, - stream); + stream, + mr); case aggregation::PRODUCT: return cudf::type_dispatcher(input.type(), ScanDispatcher(), input, inclusive, null_handling, - mr, - stream); + stream, + mr); default: CUDF_FAIL("Unsupported aggregation operator for scan"); } } @@ -295,7 +299,7 @@ std::unique_ptr scan(const column_view& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::scan(input, agg, inclusive, null_handling, mr); + return detail::scan(input, agg, inclusive, null_handling, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 4f084aacb54..ab2bb4ea839 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -32,6 +32,8 @@ #include #include +#include + namespace cudf { namespace detail { namespace { @@ -40,8 +42,8 @@ std::pair, std::unique_ptr> form_offsets_and_cha cudf::column_device_view input, size_type null_count, Transformer offsets_transformer, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::unique_ptr offsets_column{}; auto strings_count = input.size(); @@ -52,19 +54,19 @@ std::pair, std::unique_ptr> form_offsets_and_cha auto offsets_transformer_itr = thrust::make_transform_iterator(input_begin, offsets_transformer); offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); } else { auto offsets_transformer_itr = thrust::make_transform_iterator(input.begin(), offsets_transformer); offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, 
mr); } auto d_offsets = offsets_column->view().template data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); return std::make_pair(std::move(offsets_column), std::move(chars_column)); } @@ -75,8 +77,8 @@ std::unique_ptr clamp_string_column(strings_column_view const& inp ScalarIterator const& lo_replace_itr, ScalarIterator const& hi_itr, ScalarIterator const& hi_replace_itr, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto input_device_column = column_device_view::create(input.parent(), stream); auto d_input = *input_device_column; @@ -106,7 +108,7 @@ std::unique_ptr clamp_string_column(strings_column_view const& inp }; auto offset_and_char = - form_offsets_and_char_column(d_input, null_count, offsets_transformer, mr, stream); + form_offsets_and_char_column(d_input, null_count, offsets_transformer, stream, mr); auto offsets_column(std::move(offset_and_char.first)); auto chars_column(std::move(offset_and_char.second)); @@ -135,8 +137,10 @@ std::unique_ptr clamp_string_column(strings_column_view const& inp }; auto exec = rmm::exec_policy(stream); - thrust::for_each_n( - exec->on(stream), thrust::make_counting_iterator(0), input.size(), copy_transformer); + thrust::for_each_n(exec->on(stream.value()), + thrust::make_counting_iterator(0), + input.size(), + copy_transformer); return make_strings_column(input.size(), std::move(offsets_column), @@ -154,8 +158,8 @@ std::enable_if_t(), std::unique_ptr> clamp ScalarIterator const& lo_replace_itr, ScalarIterator const& hi_itr, ScalarIterator const& hi_replace_itr, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + 
rmm::mr::device_memory_resource* mr) { auto output = detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr); @@ -185,7 +189,7 @@ std::enable_if_t(), std::unique_ptr> clamp if (input.has_nulls()) { auto input_pair_iterator = make_pair_iterator(*input_device_view); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input_pair_iterator, input_pair_iterator + input.size(), scalar_zip_itr, @@ -193,7 +197,7 @@ std::enable_if_t(), std::unique_ptr> clamp trans); } else { auto input_pair_iterator = make_pair_iterator(*input_device_view); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input_pair_iterator, input_pair_iterator + input.size(), scalar_zip_itr, @@ -211,10 +215,10 @@ std::enable_if_t::value, std::unique_ptr clamp( ScalarIterator const& lo_replace_itr, ScalarIterator const& hi_itr, ScalarIterator const& hi_replace_itr, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return clamper(input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr, mr, stream); + return clamper(input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr, stream, mr); } struct dispatch_clamp { @@ -240,8 +244,8 @@ struct dispatch_clamp { scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(lo.type() == input.type(), "mismatching types of scalar and input"); @@ -252,7 +256,7 @@ struct dispatch_clamp { auto lo_replace_itr = make_pair_iterator(lo_replace); auto hi_replace_itr = make_pair_iterator(hi_replace); - 
return clamp(input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr, mr, stream); + return clamp(input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr, stream, mr); } }; @@ -263,8 +267,8 @@ std::unique_ptr dispatch_clamp::operator()( scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("clamp for list_view not supported"); } @@ -275,8 +279,8 @@ std::unique_ptr dispatch_clamp::operator()(column_view cons scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("clamp for struct_view not supported"); } @@ -288,8 +292,8 @@ std::unique_ptr dispatch_clamp::operator()( scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // add lo_replace and hi_replace to keys auto matched_column = [&] { @@ -298,7 +302,7 @@ std::unique_ptr dispatch_clamp::operator()( auto add_scalar_key = [&](scalar const& key, scalar const& key_replace) { if (key.is_valid()) { result = dictionary::detail::add_keys( - matched_view, make_column_from_scalar(key_replace, 1, stream)->view(), mr, stream); + matched_view, make_column_from_scalar(key_replace, 1, stream)->view(), stream, mr); matched_view = dictionary_column_view(result->view()); } }; @@ -325,8 +329,8 @@ std::unique_ptr dispatch_clamp::operator()( *lo_replace_index, *hi_index, *hi_replace_index, - mr, - stream); + stream, + mr); auto const indices_type = new_indices->type(); auto const output_size = new_indices->size(); @@ -364,8 +368,8 @@ std::unique_ptr clamp( scalar const& lo_replace, scalar const& hi, scalar const& hi_replace, - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(lo.type() == hi.type(), "mismatching types of limit scalars"); CUDF_EXPECTS(lo_replace.type() == hi_replace.type(), "mismatching types of replace scalars"); @@ -384,7 +388,7 @@ std::unique_ptr clamp( } return cudf::type_dispatcher( - input.type(), dispatch_clamp{}, input, lo, lo_replace, hi, hi_replace, mr, stream); + input.type(), dispatch_clamp{}, input, lo, lo_replace, hi, hi_replace, stream, mr); } } // namespace detail @@ -398,7 +402,7 @@ std::unique_ptr clamp(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::clamp(input, lo, lo_replace, hi, hi_replace, mr); + return detail::clamp(input, lo, lo_replace, hi, hi_replace, rmm::cuda_stream_default, mr); } // clamp input at lo and hi @@ -408,6 +412,6 @@ std::unique_ptr clamp(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::clamp(input, lo, lo, hi, hi, mr); + return detail::clamp(input, lo, lo, hi, hi, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 2c7542a2f5d..39fa62c99b0 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -236,7 +236,7 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< valid_count); std::unique_ptr offsets = cudf::strings::detail::make_offsets_child_column( - sizes_view.begin(), sizes_view.end(), mr, stream.value()); + sizes_view.begin(), sizes_view.end(), stream, mr); auto offsets_view = offsets->mutable_view(); int32_t size; @@ -244,9 +244,9 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< &size, offsets_view.end() - 1, sizeof(int32_t), cudaMemcpyDefault, stream.value())); // Allocate chars array and output null mask - cudf::size_type null_count = input.size() - 
valid_counter.value(stream); - std::unique_ptr output_chars = cudf::strings::detail::create_chars_child_column( - input.size(), null_count, size, mr, stream.value()); + cudf::size_type null_count = input.size() - valid_counter.value(stream); + std::unique_ptr output_chars = + cudf::strings::detail::create_chars_child_column(input.size(), null_count, size, stream, mr); auto output_chars_view = output_chars->mutable_view(); diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index a6b129630a8..6abacc6095e 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -397,8 +397,8 @@ std::unique_ptr replace_kernel_forwarder::operator() replace_kernel_forwarder::operator() offsets = cudf::strings::detail::make_offsets_child_column( - sizes_view.begin(), sizes_view.end(), mr, stream.value()); + sizes_view.begin(), sizes_view.end(), stream, mr); auto offsets_view = offsets->mutable_view(); auto device_offsets = cudf::mutable_column_device_view::create(offsets_view); int32_t size; @@ -423,7 +423,7 @@ std::unique_ptr replace_kernel_forwarder::operator() output_chars = cudf::strings::detail::create_chars_child_column( - input_col.size(), null_count, size, mr, stream.value()); + input_col.size(), null_count, size, stream, mr); auto output_chars_view = output_chars->mutable_view(); auto device_chars = cudf::mutable_column_device_view::create(output_chars_view); @@ -454,13 +454,12 @@ std::unique_ptr replace_kernel_forwarder::operator()view(), mr, stream.value()); + return cudf::dictionary::detail::add_keys(input, new_keys->view(), stream, mr); }(); auto matched_view = cudf::dictionary_column_view(matched_input->view()); - auto matched_values = cudf::dictionary::detail::set_keys( - values, matched_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); - auto matched_replacements = cudf::dictionary::detail::set_keys( - replacements, matched_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); + auto matched_values = 
cudf::dictionary::detail::set_keys(values, matched_view.keys(), stream); + auto matched_replacements = + cudf::dictionary::detail::set_keys(replacements, matched_view.keys(), stream); auto indices_type = matched_view.indices().type(); auto new_indices = cudf::type_dispatcher( diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 0f5c7595cd0..8683754422b 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -35,8 +35,8 @@ struct byte_list_conversion { std::enable_if_t::value and !is_floating_point(), std::unique_ptr> operator()(column_view const& input_column, flip_endianness configuration, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("Unsupported non-numeric and non-string column"); } @@ -45,8 +45,8 @@ struct byte_list_conversion { std::enable_if_t() or std::is_integral::value, std::unique_ptr> operator()(column_view const& input_column, flip_endianness configuration, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { size_type num_bytes = input_column.size() * sizeof(T); auto byte_column = make_numeric_column( @@ -57,22 +57,21 @@ struct byte_list_conversion { size_type mask = sizeof(T) - 1; if (configuration == flip_endianness::YES) { - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_bytes), [d_chars, d_data, mask] __device__(auto index) { d_chars[index] = d_data[index + mask - ((index & mask) << 1)]; }); } else { - thrust::copy_n(rmm::exec_policy(stream)->on(stream), d_data, num_bytes, d_chars); + thrust::copy_n(rmm::exec_policy(stream)->on(stream.value()), d_data, num_bytes, d_chars); } auto begin = thrust::make_constant_iterator(cudf::size_of(input_column.type())); 
auto offsets_column = cudf::strings::detail::make_offsets_child_column( - begin, begin + input_column.size(), mr, stream); + begin, begin + input_column.size(), stream, mr); - rmm::device_buffer null_mask = - detail::copy_bitmask(input_column, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = detail::copy_bitmask(input_column, stream, mr); return make_lists_column(input_column.size(), std::move(offsets_column), @@ -88,8 +87,8 @@ template <> std::unique_ptr byte_list_conversion::operator()( column_view const& input_column, flip_endianness configuration, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { strings_column_view input_strings(input_column); auto strings_count = input_strings.size(); @@ -101,7 +100,7 @@ std::unique_ptr byte_list_conversion::operator()( std::move(contents.children[cudf::strings_column_view::offsets_column_index]), std::move(contents.children[cudf::strings_column_view::chars_column_index]), input_column.null_count(), - detail::copy_bitmask(input_column, rmm::cuda_stream_view{stream}, mr), + detail::copy_bitmask(input_column, stream, mr), stream, mr); } @@ -114,11 +113,11 @@ std::unique_ptr byte_list_conversion::operator()( */ std::unique_ptr byte_cast(column_view const& input_column, flip_endianness endian_configuration, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return type_dispatcher( - input_column.type(), byte_list_conversion{}, input_column, endian_configuration, mr, stream); + input_column.type(), byte_list_conversion{}, input_column, endian_configuration, stream, mr); } } // namespace detail @@ -131,7 +130,7 @@ std::unique_ptr byte_cast(column_view const& input_column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::byte_cast(input_column, endian_configuration, mr, cudaStreamDefault); + return 
detail::byte_cast(input_column, endian_configuration, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index 9e6197afe0f..7173c96daed 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -14,13 +14,15 @@ * limitations under the License. */ +#include + #include #include #include #include #include -#include +#include namespace cudf { namespace detail { @@ -38,8 +40,8 @@ struct interleave_columns_functor { std::enable_if_t::value, std::unique_ptr> operator()(table_view const& strings_columns, bool create_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto num_columns = strings_columns.num_columns(); if (num_columns == 1) // Single strings column returns a copy @@ -47,7 +49,7 @@ struct interleave_columns_functor { auto strings_count = strings_columns.num_rows(); if (strings_count == 0) // All columns have 0 rows - return strings::detail::make_empty_strings_column(mr, stream); + return strings::detail::make_empty_strings_column(stream, mr); // Create device views from the strings columns. 
auto table = table_device_view::create(strings_columns, stream); @@ -83,17 +85,17 @@ struct interleave_columns_functor { auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + num_strings, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + num_strings, stream, mr); auto d_results_offsets = offsets_column->view().template data(); // Create the chars column size_type bytes = thrust::device_pointer_cast(d_results_offsets)[num_strings]; auto chars_column = - strings::detail::create_chars_child_column(num_strings, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(num_strings, null_count, bytes, stream, mr); // Fill the chars column auto d_results_chars = chars_column->mutable_view().data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), num_strings, [num_columns, d_table, d_results_offsets, d_results_chars] __device__(size_type idx) { @@ -122,8 +124,8 @@ struct interleave_columns_functor { std::enable_if_t(), std::unique_ptr> operator()( table_view const& input, bool create_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto arch_column = input.column(0); auto output_size = input.num_columns() * input.num_rows(); @@ -142,7 +144,7 @@ struct interleave_columns_functor { }; if (not create_mask) { - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), index_begin, index_end, device_output->begin(), @@ -156,7 +158,7 @@ struct interleave_columns_functor { return input.column(idx % divisor).is_valid(idx / divisor); }; - thrust::transform_if(rmm::exec_policy(stream)->on(stream), + 
thrust::transform_if(rmm::exec_policy(stream)->on(stream.value()), index_begin, index_end, device_output->begin(), @@ -193,7 +195,12 @@ std::unique_ptr interleave_columns(table_view const& input, auto const output_needs_mask = std::any_of( std::cbegin(input), std::cend(input), [](auto const& col) { return col.nullable(); }); - return type_dispatcher(dtype, detail::interleave_columns_functor{}, input, output_needs_mask, mr); + return type_dispatcher(dtype, + detail::interleave_columns_functor{}, + input, + output_needs_mask, + rmm::cuda_stream_default, + mr); } } // namespace cudf diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index a31eabe3964..7d906102cc2 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -14,45 +14,47 @@ * limitations under the License. */ +#include +#include +#include + #include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include -#include -#include +#include +#include #include #include #include -#include - #include #include #include -#include +#include +#include #include -#include +#include #include #include #include #include #include -#include -#include -#include -#include + #include namespace cudf { @@ -499,7 +501,7 @@ struct rolling_window_launcher { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - cudaStream_t stream) + rmm::cuda_stream_view stream) { constexpr cudf::size_type block_size = 256; cudf::detail::grid_1d grid(input.size(), block_size); @@ -512,28 +514,28 @@ struct rolling_window_launcher { if (input.has_nulls()) { gpu_rolling, agg_op, op, block_size, true> - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - preceding_window_begin, - following_window_begin, - min_periods); + <<>>(*input_device_view, + *default_outputs_device_view, + *output_device_view, + device_valid_count.data(), + 
preceding_window_begin, + following_window_begin, + min_periods); } else { gpu_rolling, agg_op, op, block_size, false> - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - preceding_window_begin, - following_window_begin, - min_periods); + <<>>(*input_device_view, + *default_outputs_device_view, + *output_device_view, + device_valid_count.data(), + preceding_window_begin, + following_window_begin, + min_periods); } size_type valid_count = device_valid_count.value(stream); // check the stream for debugging - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return valid_count; } @@ -551,7 +553,7 @@ struct rolling_window_launcher { size_type min_periods, std::unique_ptr const& agg, agg_op const& device_agg_op, - cudaStream_t stream) + rmm::cuda_stream_view stream) { constexpr cudf::size_type block_size = 256; cudf::detail::grid_1d grid(input.size(), block_size); @@ -564,30 +566,30 @@ struct rolling_window_launcher { if (input.has_nulls()) { gpu_rolling, agg_op, op, block_size, true> - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - preceding_window_begin, - following_window_begin, - min_periods, - device_agg_op); + <<>>(*input_device_view, + *default_outputs_device_view, + *output_device_view, + device_valid_count.data(), + preceding_window_begin, + following_window_begin, + min_periods, + device_agg_op); } else { gpu_rolling, agg_op, op, block_size, false> - <<>>(*input_device_view, - *default_outputs_device_view, - *output_device_view, - device_valid_count.data(), - preceding_window_begin, - following_window_begin, - min_periods, - device_agg_op); + <<>>(*input_device_view, + *default_outputs_device_view, + *output_device_view, + device_valid_count.data(), + preceding_window_begin, + following_window_begin, + min_periods, + device_agg_op); } size_type valid_count = device_valid_count.value(stream); // check the stream for debugging - 
CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return valid_count; } @@ -610,8 +612,8 @@ struct rolling_window_launcher { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return empty_like(input); @@ -650,8 +652,8 @@ struct rolling_window_launcher { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return empty_like(input); @@ -721,8 +723,8 @@ struct rolling_window_launcher { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Aggregation operator and/or input type combination is invalid"); } @@ -742,8 +744,8 @@ struct rolling_window_launcher { size_type min_periods, std::unique_ptr const& agg, agg_op const& device_agg_op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return empty_like(input); @@ -793,8 +795,8 @@ struct rolling_window_launcher { size_type min_periods, std::unique_ptr const& agg, agg_op device_agg_op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL( "Aggregation operator and/or input type combination is invalid: " @@ -812,8 +814,8 @@ struct rolling_window_launcher { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view 
stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(default_outputs.is_empty(), "Only LEAD/LAG window functions support default values."); @@ -828,8 +830,8 @@ struct rolling_window_launcher { following_window_begin, min_periods, agg, - mr, - stream); + stream, + mr); } // This variant is just to handle mean @@ -843,8 +845,8 @@ struct rolling_window_launcher { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return launch( input, @@ -853,8 +855,8 @@ struct rolling_window_launcher { following_window_begin, min_periods, agg, - mr, - stream); + stream, + mr); } template const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return launch(agg.get())->row_offset}, - mr, - stream); + stream, + mr); } }; @@ -895,8 +897,8 @@ struct dispatch_rolling { FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return aggregation_dispatcher(agg->kind, rolling_window_launcher{}, @@ -906,8 +908,8 @@ struct dispatch_rolling { following_window_begin, min_periods, agg, - mr, - stream); + stream, + mr); } }; @@ -916,15 +918,14 @@ struct dispatch_rolling { // Applies a user-defined rolling window function to the values in a column. 
template std::unique_ptr rolling_window_udf(column_view const& input, - PrecedingWindowIterator preceding_window, std::string const& preceding_window_str, FollowingWindowIterator following_window, std::string const& following_window_str, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { static_assert(warp_size == cudf::detail::size_in_bits(), "bitmask_type size does not match CUDA warp size"); @@ -999,7 +1000,7 @@ std::unique_ptr rolling_window_udf(column_view const& input, output->set_null_count(output->size() - device_valid_count.value(stream)); // check the stream for debugging - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return output; } @@ -1021,8 +1022,8 @@ std::unique_ptr rolling_window(column_view const& input, FollowingWindowIterator following_window_begin, size_type min_periods, std::unique_ptr const& agg, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { static_assert(warp_size == cudf::detail::size_in_bits(), "bitmask_type size does not match CUDA warp size"); @@ -1037,8 +1038,8 @@ std::unique_ptr rolling_window(column_view const& input, following_window_begin, min_periods, agg, - mr, - stream); + stream, + mr); } } // namespace detail @@ -1080,8 +1081,8 @@ std::unique_ptr rolling_window(column_view const& input, "cudf::size_type", min_periods, agg, - mr, - 0); + rmm::cuda_stream_default, + mr); } else { auto preceding_window_begin = thrust::make_constant_iterator(preceding_window); auto following_window_begin = thrust::make_constant_iterator(following_window); @@ -1092,8 +1093,8 @@ std::unique_ptr rolling_window(column_view const& input, following_window_begin, min_periods, agg, - mr, - 0); + rmm::cuda_stream_default, + mr); } } @@ -1125,8 +1126,8 @@ std::unique_ptr rolling_window(column_view const& input, 
"cudf::size_type*", min_periods, agg, - mr, - 0); + rmm::cuda_stream_default, + mr); } else { return cudf::detail::rolling_window(input, empty_like(input)->view(), @@ -1134,8 +1135,8 @@ std::unique_ptr rolling_window(column_view const& input, following_window.begin(), min_periods, agg, - mr, - 0); + rmm::cuda_stream_default, + mr); } } @@ -1241,8 +1242,8 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, "cudf::detail::following_window_wrapper", min_periods, aggr, - mr, - 0); + rmm::cuda_stream_default, + mr); } else { return cudf::detail::rolling_window( input, @@ -1253,8 +1254,8 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, following_calculator), min_periods, aggr, - mr, - 0); + rmm::cuda_stream_default, + mr); } } @@ -1387,6 +1388,7 @@ std::unique_ptr time_range_window_ASC(column_view const& input, following_calculator), min_periods, aggr, + rmm::cuda_stream_default, mr); } @@ -1558,6 +1560,7 @@ std::unique_ptr time_range_window_ASC( following_calculator), min_periods, aggr, + rmm::cuda_stream_default, mr); } @@ -1642,6 +1645,7 @@ std::unique_ptr time_range_window_DESC(column_view const& input, following_calculator), min_periods, aggr, + rmm::cuda_stream_default, mr); } @@ -1747,8 +1751,8 @@ std::unique_ptr time_range_window_DESC( following_calculator), min_periods, aggr, - mr, - 0); + rmm::cuda_stream_default, + mr); } } diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index 7bad39af717..644b320dcd5 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,13 +20,16 @@ #include #include +#include + namespace cudf { namespace { struct scalar_construction_helper { template , typename std::enable_if_t() and not is_fixed_point()>* = nullptr> - std::unique_ptr operator()(cudaStream_t stream, rmm::mr::device_memory_resource* mr) const + std::unique_ptr operator()(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { using Type = device_storage_type_t; auto s = new ScalarType(Type{}, false, stream, mr); @@ -36,7 +39,8 @@ struct scalar_construction_helper { template , typename std::enable_if_t()>* = nullptr> - std::unique_ptr operator()(cudaStream_t stream, rmm::mr::device_memory_resource* mr) const + std::unique_ptr operator()(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { using Type = device_storage_type_t; auto s = new ScalarType(Type{}, numeric::scale_type{0}, false, stream, mr); @@ -55,7 +59,7 @@ struct scalar_construction_helper { // Allocate storage for a single numeric element std::unique_ptr make_numeric_scalar(data_type type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type."); @@ -65,7 +69,7 @@ std::unique_ptr make_numeric_scalar(data_type type, // Allocate storage for a single timestamp element std::unique_ptr make_timestamp_scalar(data_type type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); @@ -75,7 +79,7 @@ std::unique_ptr make_timestamp_scalar(data_type type, // Allocate storage for a single duration element std::unique_ptr make_duration_scalar(data_type type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); @@ -85,7 +89,7 @@ std::unique_ptr make_duration_scalar(data_type type, // Allocate storage for a single fixed width element 
std::unique_ptr make_fixed_width_scalar(data_type type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type."); diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index 0efd68ac974..2b3e7e5a60a 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -99,8 +99,7 @@ std::unique_ptr search_ordered(table_view const& t, // This utility will ensure all corresponding dictionary columns have matching keys. // It will return any new dictionary columns created as well as updated table_views. - auto matched = dictionary::detail::match_dictionaries( - {t, values}, rmm::mr::get_current_device_resource(), stream.value()); + auto matched = dictionary::detail::match_dictionaries({t, values}, stream); auto d_t = table_device_view::create(matched.second.front(), stream); auto d_values = table_device_view::create(matched.second.back(), stream); auto count_it = thrust::make_counting_iterator(0); @@ -304,12 +303,10 @@ std::unique_ptr multi_contains_dispatch::operator()( dictionary_column_view const haystack(haystack_in); dictionary_column_view const needles(needles_in); // first combine keys so both dictionaries have the same set - auto haystack_matched = dictionary::detail::add_keys( - haystack, needles.keys(), rmm::mr::get_current_device_resource(), stream.value()); + auto haystack_matched = dictionary::detail::add_keys(haystack, needles.keys(), stream); auto const haystack_view = dictionary_column_view(haystack_matched->view()); - auto needles_matched = dictionary::detail::set_keys( - needles, haystack_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); - auto const needles_view = dictionary_column_view(needles_matched->view()); + auto needles_matched = dictionary::detail::set_keys(needles, haystack_view.keys(), stream); + auto const needles_view = dictionary_column_view(needles_matched->view()); // now just use the indices for 
the contains column_view const haystack_indices = haystack_view.get_indices_annotated(); diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu index b737a889e98..1cbbdd0cff6 100644 --- a/cpp/src/sort/is_sorted.cu +++ b/cpp/src/sort/is_sorted.cu @@ -22,16 +22,18 @@ #include #include +#include namespace cudf { namespace detail { + template auto is_sorted(cudf::table_view const& in, std::vector const& column_order, - std::vector const& null_precedence) + std::vector const& null_precedence, + rmm::cuda_stream_view stream) { - cudaStream_t stream = 0; - auto in_d = table_device_view::create(in); + auto in_d = table_device_view::create(in); rmm::device_vector d_column_order(column_order); rmm::device_vector const d_null_precedence = (has_nulls) ? rmm::device_vector{null_precedence} @@ -39,7 +41,7 @@ auto is_sorted(cudf::table_view const& in, auto ineq_op = row_lexicographic_comparator( *in_d, *in_d, d_column_order.data().get(), d_null_precedence.data().get()); - auto sorted = thrust::is_sorted(rmm::exec_policy(stream)->on(stream), + auto sorted = thrust::is_sorted(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(in.num_rows()), ineq_op); @@ -68,9 +70,9 @@ bool is_sorted(cudf::table_view const& in, } if (has_nulls(in)) { - return detail::is_sorted(in, column_order, null_precedence); + return detail::is_sorted(in, column_order, null_precedence, rmm::cuda_stream_default); } else { - return detail::is_sorted(in, column_order, null_precedence); + return detail::is_sorted(in, column_order, null_precedence, rmm::cuda_stream_default); } } diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index 50f8155313f..cb76701dd34 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -55,7 +55,7 @@ struct unique_comparator { // Assign rank from 1 to n unique values. Equal values get same rank value. 
rmm::device_vector sorted_dense_rank(column_view input_col, column_view sorted_order_view, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto device_table = table_device_view::create(table_view{{input_col}}, stream); auto const input_size = input_col.size(); @@ -68,7 +68,7 @@ rmm::device_vector sorted_dense_rank(column_view input_col, auto unique_it = thrust::make_transform_iterator(thrust::make_counting_iterator(0), conv); - thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), unique_it, unique_it + input_size, dense_rank_sorted.data().get()); @@ -78,7 +78,7 @@ rmm::device_vector sorted_dense_rank(column_view input_col, auto unique_it = thrust::make_transform_iterator(thrust::make_counting_iterator(0), conv); - thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), unique_it, unique_it + input_size, dense_rank_sorted.data().get()); @@ -110,13 +110,13 @@ void tie_break_ranks_transform(rmm::device_vector const &dense_rank_s outputIterator rank_iter, TieBreaker tie_breaker, Transformer transformer, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto const input_size = sorted_order_view.size(); rmm::device_vector tie_sorted(input_size, 0); // algorithm: reduce_by_key(dense_rank, 1, n, reduction_tie_breaker) // reduction_tie_breaker = min, max, min_count - thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream), + thrust::reduce_by_key(rmm::exec_policy(stream)->on(stream.value()), dense_rank_sorted.begin(), dense_rank_sorted.end(), tie_iter, @@ -129,7 +129,7 @@ void tie_break_ranks_transform(rmm::device_vector const &dense_rank_s [tied_rank = tie_sorted.begin(), transformer] __device__(auto dense_pos) { return transformer(tied_rank[dense_pos - 1]); }); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), sorted_tied_rank, 
sorted_tied_rank + input_size, sorted_order_view.begin(), @@ -139,10 +139,10 @@ void tie_break_ranks_transform(rmm::device_vector const &dense_rank_s template void rank_first(column_view sorted_order_view, mutable_column_view rank_mutable_view, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // stable sort order ranking (no ties) - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(1), thrust::make_counting_iterator(rank_mutable_view.size() + 1), sorted_order_view.begin(), @@ -153,10 +153,10 @@ template void rank_dense(rmm::device_vector const &dense_rank_sorted, column_view sorted_order_view, mutable_column_view rank_mutable_view, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // All equal values have same rank and rank always increases by 1 between groups - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), dense_rank_sorted.begin(), dense_rank_sorted.end(), sorted_order_view.begin(), @@ -167,7 +167,7 @@ template void rank_min(rmm::device_vector const &group_keys, column_view sorted_order_view, mutable_column_view rank_mutable_view, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // min of first in the group // All equal values have min of ranks among them. @@ -185,7 +185,7 @@ template void rank_max(rmm::device_vector const &group_keys, column_view sorted_order_view, mutable_column_view rank_mutable_view, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // max of first in the group // All equal values have max of ranks among them. @@ -202,7 +202,7 @@ void rank_max(rmm::device_vector const &group_keys, void rank_average(rmm::device_vector const &group_keys, column_view sorted_order_view, mutable_column_view rank_mutable_view, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // k, k+1, .. 
k+n-1 // average = (n*k+ n*(n-1)/2)/n @@ -236,13 +236,13 @@ std::unique_ptr rank(column_view const &input, null_policy null_handling, null_order null_precedence, bool percentage, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { data_type const output_type = (percentage or method == rank_method::AVERAGE) ? data_type(type_id::FLOAT64) : data_type(type_to_id()); - std::unique_ptr rank_column = [&null_handling, &output_type, &input, &mr, &stream] { + std::unique_ptr rank_column = [&null_handling, &output_type, &input, &stream, &mr] { // na_option=keep assign NA to NA values if (null_handling == null_policy::EXCLUDE) return make_numeric_column(output_type, @@ -320,7 +320,7 @@ std::unique_ptr rank(column_view const &input, (null_handling == null_policy::EXCLUDE) ? input.size() - input.null_count() : input.size(); auto drs = dense_rank_sorted.data().get(); bool const is_dense = (method == rank_method::DENSE); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), rank_iter, rank_iter + input.size(), rank_iter, @@ -340,6 +340,13 @@ std::unique_ptr rank(column_view const &input, bool percentage, rmm::mr::device_memory_resource *mr) { - return detail::rank(input, method, column_order, null_handling, null_precedence, percentage, mr); + return detail::rank(input, + method, + column_order, + null_handling, + null_precedence, + percentage, + rmm::cuda_stream_default, + mr); } } // namespace cudf diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index e16291b6aa2..5b7459b396f 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -50,11 +50,10 @@ namespace { template std::unique_ptr counts_fn(strings_column_view const& strings, UnaryFunction& ufn, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + 
rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); - auto execpol = rmm::exec_policy(stream); auto strings_column = cudf::column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create output column @@ -62,13 +61,12 @@ std::unique_ptr counts_fn(strings_column_view const& strings, cudf::data_type{type_id::INT32}, strings_count, rmm::device_buffer(strings_count * sizeof(int32_t), stream, mr), - cudf::detail::copy_bitmask( - strings.parent(), rmm::cuda_stream_view{stream}, mr), // copy the null mask + cudf::detail::copy_bitmask(strings.parent(), stream, mr), // copy the null mask strings.null_count()); auto results_view = results->mutable_view(); auto d_lengths = results_view.data(); // fill in the lengths - thrust::transform(execpol->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_lengths, @@ -86,20 +84,20 @@ std::unique_ptr counts_fn(strings_column_view const& strings, std::unique_ptr count_characters( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto ufn = [] __device__(const string_view& d_str) { return d_str.length(); }; - return counts_fn(strings, ufn, mr, stream); + return counts_fn(strings, ufn, stream, mr); } std::unique_ptr count_bytes( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto ufn = [] __device__(const string_view& d_str) { return d_str.size_bytes(); }; - return counts_fn(strings, ufn, mr, stream); + return counts_fn(strings, ufn, stream, mr); } } // namespace 
detail @@ -134,8 +132,8 @@ namespace detail { // std::unique_ptr code_points( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; @@ -144,7 +142,7 @@ std::unique_ptr code_points( rmm::device_vector offsets(strings.size() + 1); size_type* d_offsets = offsets.data().get(); thrust::transform_inclusive_scan( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), d_offsets + 1, @@ -154,7 +152,7 @@ std::unique_ptr code_points( return length; }, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(size_type), stream)); + CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(size_type), stream.value())); // the total size is the number of characters in the entire column size_type num_characters = offsets.back(); @@ -165,11 +163,11 @@ std::unique_ptr code_points( // fill column with character code-point values auto d_results = results_view.data(); // now set the ranges from each strings' character values - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings.size(), code_points_fn{d_column, d_offsets, d_results}); - // + results->set_null_count(0); return results; } @@ -182,21 +180,21 @@ std::unique_ptr count_characters(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_characters(strings, mr); + return detail::count_characters(strings, rmm::cuda_stream_default, mr); } std::unique_ptr count_bytes(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { 
CUDF_FUNC_RANGE(); - return detail::count_bytes(strings, mr); + return detail::count_bytes(strings, rmm::cuda_stream_default, mr); } std::unique_ptr code_points(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::code_points(strings, mr); + return detail::code_points(strings, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index f2482588cc8..7dd4962e8de 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -15,6 +15,9 @@ */ #include +#include +#include + #include #include #include @@ -25,8 +28,8 @@ #include #include #include -#include -#include + +#include namespace cudf { namespace strings { @@ -257,14 +260,15 @@ std::unique_ptr capitalize(strings_column_view const& strings, { CUDF_FUNC_RANGE(); return detail::modify_strings( - strings, mr, nullptr); + strings, rmm::cuda_stream_default, mr); } std::unique_ptr title(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::modify_strings(strings, mr, nullptr); + return detail::modify_strings( + strings, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 48306ce4e11..453f1e7daf7 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -128,17 +128,17 @@ struct upper_lower_fn { * * @param strings Strings to convert. * @param case_flag The character type to convert (upper, lower, or both) - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with characters converted. 
*/ std::unique_ptr convert_case(strings_column_view const& strings, character_flags_table_type case_flag, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -146,8 +146,7 @@ std::unique_ptr convert_case(strings_column_view const& strings, size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // get the lookup tables used for case conversion auto d_flags = get_character_flags_table(); @@ -159,24 +158,24 @@ std::unique_ptr convert_case(strings_column_view const& strings, thrust::make_counting_iterator(0), upper_lower_fn{d_column, case_flag, d_flags, d_case_table, d_special_case_mapping}); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.data(); // build the chars column -- convert characters based on case_flag parameter size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); thrust::for_each_n( - execpol->on(stream), 
+ execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, upper_lower_fn{ d_column, case_flag, d_flags, d_case_table, d_special_case_mapping, d_new_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -190,32 +189,32 @@ std::unique_ptr convert_case(strings_column_view const& strings, std::unique_ptr to_lower( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { character_flags_table_type case_flag = IS_UPPER(0xFF); // convert only upper case characters - return convert_case(strings, case_flag, mr, stream); + return convert_case(strings, case_flag, stream, mr); } // std::unique_ptr to_upper( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { character_flags_table_type case_flag = IS_LOWER(0xFF); // convert only lower case characters - return convert_case(strings, case_flag, mr, stream); + return convert_case(strings, case_flag, stream, mr); } // std::unique_ptr swapcase( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { // convert only upper or lower case characters character_flags_table_type case_flag = IS_LOWER(0xFF) | IS_UPPER(0xFF); - return convert_case(strings, case_flag, mr, stream); + return convert_case(strings, case_flag, stream, mr); } } // namespace detail @@ -226,21 +225,21 @@ std::unique_ptr to_lower(strings_column_view const& strings, 
rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_lower(strings, mr); + return detail::to_lower(strings, rmm::cuda_stream_default, mr); } std::unique_ptr to_upper(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_upper(strings, mr); + return detail::to_upper(strings, rmm::cuda_stream_default, mr); } std::unique_ptr swapcase(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::swapcase(strings, mr); + return detail::swapcase(strings, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 6e63e756c2e..da85c551adf 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -39,27 +39,26 @@ std::unique_ptr all_characters_of_type( strings_column_view const& strings, string_character_types types, string_character_types verify_types, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // get the static character types table auto d_flags = detail::get_character_flags_table(); // set 
the output values by checking the character types for each string - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -148,7 +147,7 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str string_character_types types_to_remove, string_scalar const& replacement, string_character_types types_to_keep, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(replacement.is_valid(), "Parameter replacement must be valid"); @@ -171,12 +170,11 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str d_replacement}; // copy null mask from input column - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( - filterer, strings_count, strings.null_count(), mr, stream); + filterer, strings_count, strings.null_count(), stream, mr); // return new strings column return make_strings_column(strings_count, @@ -190,21 +188,20 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str std::unique_ptr is_integer( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), 
rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), d_results, @@ -216,7 +213,7 @@ std::unique_ptr is_integer( return results; } -bool all_integer(strings_column_view const& strings, cudaStream_t stream = 0) +bool all_integer(strings_column_view const& strings, rmm::cuda_stream_view stream) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; @@ -225,7 +222,7 @@ bool all_integer(strings_column_view const& strings, cudaStream_t stream = 0) if (d_column.is_null(idx)) return false; return string::is_integer(d_column.element(idx)); }); - return thrust::all_of(rmm::exec_policy(stream)->on(stream), + return thrust::all_of(rmm::exec_policy(stream)->on(stream.value()), transformer_itr, transformer_itr + strings.size(), thrust::identity()); @@ -233,22 +230,21 @@ bool all_integer(strings_column_view const& strings, cudaStream_t stream = 0) std::unique_ptr is_float( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto 
results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // check strings for valid float chars - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), d_results, @@ -260,7 +256,7 @@ std::unique_ptr is_float( return results; } -bool all_float(strings_column_view const& strings, cudaStream_t stream = 0) +bool all_float(strings_column_view const& strings, rmm::cuda_stream_view stream) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; @@ -269,7 +265,7 @@ bool all_float(strings_column_view const& strings, cudaStream_t stream = 0) if (d_column.is_null(idx)) return false; return string::is_float(d_column.element(idx)); }); - return thrust::all_of(rmm::exec_policy(stream)->on(stream), + return thrust::all_of(rmm::exec_policy(stream)->on(stream.value()), transformer_itr, transformer_itr + strings.size(), thrust::identity()); @@ -285,7 +281,7 @@ std::unique_ptr all_characters_of_type(strings_column_view const& string rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::all_characters_of_type(strings, types, verify_types, mr); + return detail::all_characters_of_type(strings, types, verify_types, rmm::cuda_stream_default, mr); } std::unique_ptr filter_characters_of_type(strings_column_view const& strings, @@ -296,33 +292,33 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str { CUDF_FUNC_RANGE(); return detail::filter_characters_of_type( - strings, types_to_remove, replacement, types_to_keep, 0, mr); + strings, types_to_remove, replacement, types_to_keep, rmm::cuda_stream_default, mr); } std::unique_ptr is_integer(strings_column_view const& strings, 
rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_integer(strings, mr); + return detail::is_integer(strings, rmm::cuda_stream_default, mr); } std::unique_ptr is_float(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_float(strings, mr); + return detail::is_float(strings, rmm::cuda_stream_default, mr); } bool all_integer(strings_column_view const& strings) { CUDF_FUNC_RANGE(); - return detail::all_integer(strings); + return detail::all_integer(strings, rmm::cuda_stream_default); } bool all_float(strings_column_view const& strings) { CUDF_FUNC_RANGE(); - return detail::all_float(strings); + return detail::all_float(strings, rmm::cuda_stream_default); } } // namespace strings diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index 57bd7abef2f..1ef8e691149 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -28,9 +30,8 @@ #include #include -#include - #include +#include #include #include @@ -41,12 +42,12 @@ namespace cudf { namespace strings { namespace detail { -// + std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto num_columns = strings_columns.num_columns(); CUDF_EXPECTS(num_columns > 0, "At least one column must be specified"); @@ -59,7 +60,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, return std::make_unique(*(strings_columns.begin()), stream, mr); auto strings_count = strings_columns.num_rows(); if (strings_count == 0) // empty begets empty - return detail::make_empty_strings_column(mr, stream); + return detail::make_empty_strings_column(stream, mr); CUDF_EXPECTS(separator.is_valid(), "Parameter separator 
must be a valid string_scalar"); string_view d_separator(separator.data(), separator.size()); @@ -111,17 +112,17 @@ std::unique_ptr concatenate(table_view const& strings_columns, auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_results_offsets = offsets_column->view().data(); // create the chars column size_type bytes = thrust::device_pointer_cast(d_results_offsets)[strings_count]; auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); // fill the chars column auto d_results_chars = chars_column->mutable_view().data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_table, num_columns, d_separator, d_narep, d_results_offsets, d_results_chars] __device__( @@ -154,15 +155,14 @@ std::unique_ptr concatenate(table_view const& strings_columns, mr); } -// std::unique_ptr join_strings(strings_column_view const& strings, string_scalar const& separator, string_scalar const& narep, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be a valid string_scalar"); @@ -178,7 +178,7 @@ std::unique_ptr join_strings(strings_column_view const& strings, auto d_output_offsets = 
output_offsets.data().get(); // using inclusive-scan to compute last entry which is the total size thrust::transform_inclusive_scan( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_output_offsets + 1, @@ -193,7 +193,7 @@ std::unique_ptr join_strings(strings_column_view const& strings, return bytes; }, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_output_offsets, 0, sizeof(size_type), stream)); + CUDA_TRY(cudaMemsetAsync(d_output_offsets, 0, sizeof(size_type), stream.value())); // total size is the last entry size_type bytes = output_offsets.back(); @@ -207,7 +207,7 @@ std::unique_ptr join_strings(strings_column_view const& strings, new_offsets, sizeof(new_offsets), cudaMemcpyHostToDevice, - stream)); + stream.value())); // build null mask // only one entry so it is either all valid or all null @@ -218,11 +218,11 @@ std::unique_ptr join_strings(strings_column_view const& strings, null_count = 1; } auto chars_column = - detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, d_separator, d_narep, d_output_offsets, d_chars] __device__(size_type idx) { @@ -248,13 +248,12 @@ std::unique_ptr join_strings(strings_column_view const& strings, mr); } -// std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto num_columns = strings_columns.num_columns(); CUDF_EXPECTS(num_columns > 0, "At least one column must be 
specified"); @@ -268,7 +267,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, CUDF_EXPECTS(strings_count == separators.size(), "Separators column should be the same size as the strings columns"); if (strings_count == 0) // Empty begets empty - return detail::make_empty_strings_column(mr, stream); + return detail::make_empty_strings_column(stream, mr); // Invalid output column strings - null rows string_view const invalid_str{nullptr, 0}; @@ -287,7 +286,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Execute it on every element thrust::transform( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), out_col_strings.data().get(), @@ -373,17 +372,17 @@ std::unique_ptr concatenate(table_view const& strings_columns, auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_results_offsets = offsets_column->view().data(); // Create the chars column size_type bytes = thrust::device_pointer_cast(d_results_offsets)[strings_count]; auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); // Fill the chars column auto d_results_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_table, @@ -453,7 +452,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - 
return detail::concatenate(strings_columns, separator, narep, mr); + return detail::concatenate(strings_columns, separator, narep, rmm::cuda_stream_default, mr); } std::unique_ptr join_strings(strings_column_view const& strings, @@ -462,7 +461,7 @@ std::unique_ptr join_strings(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::join_strings(strings, separator, narep, mr); + return detail::join_strings(strings, separator, narep, rmm::cuda_stream_default, mr); } std::unique_ptr concatenate(table_view const& strings_columns, @@ -472,7 +471,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(strings_columns, separators, separator_narep, col_narep, mr); + return detail::concatenate( + strings_columns, separators, separator_narep, col_narep, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 96c87f554b5..246a5cad1ae 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -69,8 +69,8 @@ std::unique_ptr contains_util( strings_column_view const& strings, std::string const& pattern, bool beginning_only = false, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -81,32 +81,31 @@ std::unique_ptr contains_util( auto d_prog = *prog; // create the output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = 
make_numeric_column(data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // fill the output column auto execpol = rmm::exec_policy(stream); int regex_insts = d_prog.insts_counts(); if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, contains_fn{d_prog, d_column, beginning_only}); else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, contains_fn{d_prog, d_column, beginning_only}); else - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -121,19 +120,19 @@ std::unique_ptr contains_util( std::unique_ptr contains_re( strings_column_view const& strings, std::string const& pattern, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return contains_util(strings, pattern, false, mr, stream); + return contains_util(strings, pattern, false, stream, mr); } std::unique_ptr matches_re( strings_column_view const& strings, std::string const& pattern, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return contains_util(strings, pattern, true, mr, stream); + return contains_util(strings, pattern, true, 
stream, mr); } } // namespace detail @@ -145,7 +144,7 @@ std::unique_ptr contains_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_re(strings, pattern, mr); + return detail::contains_re(strings, pattern, rmm::cuda_stream_default, mr); } std::unique_ptr matches_re(strings_column_view const& strings, @@ -153,7 +152,7 @@ std::unique_ptr matches_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::matches_re(strings, pattern, mr); + return detail::matches_re(strings, pattern, rmm::cuda_stream_default, mr); } namespace detail { @@ -191,8 +190,8 @@ struct count_fn { std::unique_ptr count_re( strings_column_view const& strings, std::string const& pattern, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -203,32 +202,31 @@ std::unique_ptr count_re( auto d_prog = *prog; // create the output column - auto results = make_numeric_column( - data_type{type_id::INT32}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::INT32}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // fill the output column auto execpol = rmm::exec_policy(stream); int regex_insts = d_prog.insts_counts(); if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), 
thrust::make_counting_iterator(strings_count), d_results, count_fn{d_prog, d_column}); else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, count_fn{d_prog, d_column}); else - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -247,7 +245,7 @@ std::unique_ptr count_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_re(strings, pattern, mr); + return detail::count_re(strings, pattern, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index 1ba2151c0a7..e46d1dbe4b5 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -39,7 +39,7 @@ namespace detail { // Convert strings column to boolean column std::unique_ptr to_booleans(strings_column_view const& strings, string_scalar const& true_string, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); @@ -52,17 +52,16 @@ std::unique_ptr to_booleans(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create output column copying the strings' null-mask - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + 
stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -84,7 +83,7 @@ std::unique_ptr to_booleans(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_booleans(strings, true_string, cudaStream_t{}, mr); + return detail::to_booleans(strings, true_string, rmm::cuda_stream_default, mr); } namespace detail { @@ -92,11 +91,11 @@ namespace detail { std::unique_ptr from_booleans(column_view const& booleans, string_scalar const& true_string, string_scalar const& false_string, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = booleans.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(booleans.type().id() == type_id::BOOL8, "Input column must be boolean type"); CUDF_EXPECTS(true_string.is_valid() && true_string.size() > 0, @@ -110,8 +109,7 @@ std::unique_ptr from_booleans(column_view const& booleans, auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(booleans, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(booleans, stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), @@ -125,17 +123,17 @@ std::unique_ptr from_booleans(column_view const& booleans, return bytes; }); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = 
offsets_column->view(); auto d_offsets = offsets_view.data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, booleans.null_count(), bytes, mr, stream); + create_chars_child_column(strings_count, booleans.null_count(), bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_column, d_true, d_false, d_offsets, d_chars] __device__(size_type idx) { @@ -143,7 +141,7 @@ std::unique_ptr from_booleans(column_view const& booleans, string_view result = (d_column.element(idx) ? d_true : d_false); memcpy(d_chars + d_offsets[idx], result.data(), result.size_bytes()); }); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -163,7 +161,7 @@ std::unique_ptr from_booleans(column_view const& booleans, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_booleans(booleans, true_string, false_string, cudaStream_t{}, mr); + return detail::from_booleans(booleans, true_string, false_string, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index f716b1500c6..688ebacb95c 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -123,7 +123,7 @@ struct format_compiler { {'p', 2}, {'j', 3}}; - format_compiler(const char* fmt, cudaStream_t stream) : format(fmt), d_items(0, stream) + format_compiler(const char* fmt, rmm::cuda_stream_view stream) : format(fmt), d_items(0, stream) { std::vector items; const char* str = format.c_str(); @@ -165,7 +165,7 @@ struct format_compiler { items.data(), items.size() * sizeof(items[0]), 
cudaMemcpyHostToDevice, - stream)); + stream.value())); } format_item const* format_items() { return d_items.data(); } @@ -376,14 +376,14 @@ struct dispatch_to_timestamps_fn { std::string const& format, timestamp_units units, mutable_column_view& results_view, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { format_compiler compiler(format.c_str(), stream); auto d_items = compiler.format_items(); auto d_results = results_view.data(); parse_datetime pfn{ d_strings, d_items, compiler.items_count(), units, compiler.subsecond_precision()}; - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(results_view.size()), d_results, @@ -394,7 +394,7 @@ struct dispatch_to_timestamps_fn { std::string const&, timestamp_units, mutable_column_view&, - cudaStream_t) const + rmm::cuda_stream_view) const { CUDF_FAIL("Only timestamps type are expected"); } @@ -406,7 +406,7 @@ struct dispatch_to_timestamps_fn { std::unique_ptr to_timestamps(strings_column_view const& strings, data_type timestamp_type, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); @@ -418,13 +418,12 @@ std::unique_ptr to_timestamps(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; - auto results = make_timestamp_column( - timestamp_type, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_timestamp_column(timestamp_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); cudf::type_dispatcher( timestamp_type, dispatch_to_timestamps_fn(), 
d_column, format, units, results_view, stream); @@ -558,7 +557,7 @@ struct check_datetime_format { std::unique_ptr is_timestamp(strings_column_view const& strings, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); @@ -569,18 +568,17 @@ std::unique_ptr is_timestamp(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); format_compiler compiler(format.c_str(), stream); thrust::transform( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -600,7 +598,7 @@ std::unique_ptr to_timestamps(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_timestamps(strings, timestamp_type, format, cudaStream_t{}, mr); + return detail::to_timestamps(strings, timestamp_type, format, rmm::cuda_stream_default, mr); } std::unique_ptr is_timestamp(strings_column_view const& strings, @@ -608,7 +606,7 @@ std::unique_ptr is_timestamp(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_timestamp(strings, format, cudaStream_t{}, mr); + return detail::is_timestamp(strings, format, rmm::cuda_stream_default, mr); } namespace detail { @@ -849,10 +847,10 @@ struct dispatch_from_timestamps_fn { timestamp_units units, 
const int32_t* d_offsets, char* d_chars, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { datetime_formatter pfn{d_timestamps, d_format_items, items_count, units, d_offsets, d_chars}; - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), d_timestamps.size(), pfn); @@ -864,7 +862,7 @@ struct dispatch_from_timestamps_fn { timestamp_units, const int32_t*, char* d_chars, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { CUDF_FAIL("Only timestamps type are expected"); } @@ -875,11 +873,11 @@ struct dispatch_from_timestamps_fn { // std::unique_ptr from_timestamps(column_view const& timestamps, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = timestamps.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); timestamp_units units = @@ -892,8 +890,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(timestamps, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(timestamps, stream, mr); // Each string will be the same number of bytes which can be determined // directly from the format string. auto d_str_bytes = compiler.template_bytes(); // size in bytes of each string @@ -904,14 +901,14 @@ std::unique_ptr from_timestamps(column_view const& timestamps, return (d_column.is_null(idx) ? 
0 : d_str_bytes); }); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.template data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, timestamps.null_count(), bytes, mr, stream); + create_chars_child_column(strings_count, timestamps.null_count(), bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); // fill in chars column with timestamps @@ -925,7 +922,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, d_new_offsets, d_chars, stream); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -944,7 +941,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_timestamps(timestamps, format, cudaStream_t{}, mr); + return detail::from_timestamps(timestamps, format, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index d2709e2ebe1..cdcef791f7b 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -80,7 +80,8 @@ struct alignas(4) format_item { struct format_compiler { std::string format; rmm::device_uvector d_items; - format_compiler(const char* format_, cudaStream_t stream) : format(format_), d_items(0, stream) + format_compiler(const char* format_, rmm::cuda_stream_view stream) + : format(format_), d_items(0, stream) { static std::map const specifier_lengths = { {'-', -1}, // '-' if negative @@ -150,7 +151,7 @@ struct format_compiler { 
items.data(), items.size() * sizeof(items[0]), cudaMemcpyHostToDevice, - stream)); + stream.value())); } format_item const* compiled_format_items() { return d_items.data(); } @@ -400,8 +401,8 @@ struct dispatch_from_durations_fn { template ()>* = nullptr> std::unique_ptr operator()(column_view const& durations, std::string const& format, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); @@ -413,14 +414,13 @@ struct dispatch_from_durations_fn { auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(durations, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(durations, stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), duration_to_string_size_fn{d_column, d_format_items, compiler.items_count()}); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.template data(); @@ -428,17 +428,16 @@ struct dispatch_from_durations_fn { auto const chars_bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = detail::create_chars_child_column( - strings_count, durations.null_count(), chars_bytes, mr, stream); + strings_count, durations.null_count(), chars_bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, 
duration_to_string_fn{ d_column, d_format_items, compiler.items_count(), d_new_offsets, d_chars}); - // return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -452,8 +451,8 @@ struct dispatch_from_durations_fn { template ()>* = nullptr> std::unique_ptr operator()(column_view const&, std::string const& format, - rmm::mr::device_memory_resource*, - cudaStream_t) const + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const { CUDF_FAIL("Values for from_durations function must be a duration type."); } @@ -678,13 +677,13 @@ struct dispatch_to_durations_fn { void operator()(column_device_view const& d_strings, std::string const& format, mutable_column_view& results_view, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { format_compiler compiler(format.c_str(), stream); auto d_items = compiler.compiled_format_items(); auto d_results = results_view.data(); parse_duration pfn{d_strings, d_items, compiler.items_count()}; - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(results_view.size()), d_results, @@ -694,7 +693,7 @@ struct dispatch_to_durations_fn { void operator()(column_device_view const&, std::string const&, mutable_column_view&, - cudaStream_t) const + rmm::cuda_stream_view) const { CUDF_FAIL("Only durations type are expected for to_durations function"); } @@ -704,20 +703,20 @@ struct dispatch_to_durations_fn { std::unique_ptr from_durations(column_view const& durations, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = durations.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); return type_dispatcher( - durations.type(), dispatch_from_durations_fn{}, durations, 
format, mr, stream); + durations.type(), dispatch_from_durations_fn{}, durations, format, stream, mr); } std::unique_ptr to_durations(strings_column_view const& strings, data_type duration_type, std::string const& format, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); @@ -728,13 +727,12 @@ std::unique_ptr to_durations(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; - auto results = make_duration_column( - duration_type, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_duration_column(duration_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); cudf::type_dispatcher( duration_type, dispatch_to_durations_fn(), d_column, format, results_view, stream); @@ -749,7 +747,7 @@ std::unique_ptr from_durations(column_view const& durations, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_durations(durations, format, cudaStream_t{}, mr); + return detail::from_durations(durations, format, rmm::cuda_stream_default, mr); } std::unique_ptr to_durations(strings_column_view const& strings, @@ -758,7 +756,7 @@ std::unique_ptr to_durations(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_durations(strings, duration_type, format, cudaStream_t{}, mr); + return detail::to_durations(strings, duration_type, format, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 8abf49c5dca..4f5edb660e5 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ 
b/cpp/src/strings/convert/convert_floats.cu @@ -148,10 +148,10 @@ struct dispatch_to_floats_fn { std::enable_if_t::value>* = nullptr> void operator()(column_device_view const& strings_column, mutable_column_view& output_column, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { auto d_results = output_column.data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_column.size()), d_results, @@ -159,7 +159,7 @@ struct dispatch_to_floats_fn { } // non-integral types throw an exception template ::value>* = nullptr> - void operator()(column_device_view const&, mutable_column_view&, cudaStream_t) const + void operator()(column_device_view const&, mutable_column_view&, rmm::cuda_stream_view) const { CUDF_FAIL("Output for to_floats must be a float type."); } @@ -170,7 +170,7 @@ struct dispatch_to_floats_fn { // This will convert a strings column into any float column type. 
std::unique_ptr to_floats(strings_column_view const& strings, data_type output_type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); @@ -178,13 +178,12 @@ std::unique_ptr to_floats(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create float output column copying the strings null-mask - auto results = make_numeric_column( - output_type, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(output_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); // fill output column with floats type_dispatcher(output_type, dispatch_to_floats_fn{}, d_strings, results_view, stream); @@ -201,7 +200,7 @@ std::unique_ptr to_floats(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_floats(strings, output_type, cudaStream_t{}, mr); + return detail::to_floats(strings, output_type, rmm::cuda_stream_default, mr); } namespace detail { @@ -463,31 +462,30 @@ struct dispatch_from_floats_fn { template ::value>* = nullptr> std::unique_ptr operator()(column_view const& floats, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { size_type strings_count = floats.size(); auto column = column_device_view::create(floats, stream); auto d_column = *column; // copy the null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(floats, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr); // build offsets column auto offsets_transformer_itr = 
thrust::make_transform_iterator( thrust::make_counting_iterator(0), float_to_string_size_fn{d_column}); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.template data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - detail::create_chars_child_column(strings_count, floats.null_count(), bytes, mr, stream); + detail::create_chars_child_column(strings_count, floats.null_count(), bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, float_to_string_fn{d_column, d_offsets, d_chars}); @@ -504,8 +502,8 @@ struct dispatch_from_floats_fn { // non-float types throw an exception template ::value>* = nullptr> std::unique_ptr operator()(column_view const&, - rmm::mr::device_memory_resource*, - cudaStream_t) const + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const { CUDF_FAIL("Values for from_floats function must be a float type."); } @@ -515,13 +513,13 @@ struct dispatch_from_floats_fn { // This will convert all float column types into a strings column. 
std::unique_ptr from_floats(column_view const& floats, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = floats.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); - return type_dispatcher(floats.type(), dispatch_from_floats_fn{}, floats, mr, stream); + return type_dispatcher(floats.type(), dispatch_from_floats_fn{}, floats, stream, mr); } } // namespace detail @@ -531,7 +529,7 @@ std::unique_ptr from_floats(column_view const& floats, std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_floats(floats, cudaStream_t{}, mr); + return detail::from_floats(floats, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index a8ea7cf3ab9..3bb422d17f3 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -93,10 +93,10 @@ struct dispatch_hex_to_integers_fn { template ::value>* = nullptr> void operator()(column_device_view const& strings_column, mutable_column_view& output_column, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { auto d_results = output_column.data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_column.size()), d_results, @@ -104,7 +104,7 @@ struct dispatch_hex_to_integers_fn { } // non-integral types throw an exception template ::value>* = nullptr> - void operator()(column_device_view const&, mutable_column_view&, cudaStream_t) const + void operator()(column_device_view const&, mutable_column_view&, rmm::cuda_stream_view) const { CUDF_FAIL("Output for hex_to_integers must be an integral 
type."); } @@ -113,7 +113,7 @@ struct dispatch_hex_to_integers_fn { template <> void dispatch_hex_to_integers_fn::operator()(column_device_view const&, mutable_column_view&, - cudaStream_t) const + rmm::cuda_stream_view) const { CUDF_FAIL("Output for hex_to_integers must not be a boolean type."); } @@ -124,21 +124,20 @@ void dispatch_hex_to_integers_fn::operator()(column_device_view const&, std::unique_ptr hex_to_integers( strings_column_view const& strings, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(output_type); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create integer output column copying the strings null-mask - auto results = make_numeric_column( - output_type, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(output_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); // fill output column with integers type_dispatcher(output_type, dispatch_hex_to_integers_fn{}, d_strings, results_view, stream); @@ -147,21 +146,20 @@ std::unique_ptr hex_to_integers( } std::unique_ptr is_hex(strings_column_view const& strings, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), 
rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), d_results, @@ -193,14 +191,14 @@ std::unique_ptr hex_to_integers(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hex_to_integers(strings, output_type, mr); + return detail::hex_to_integers(strings, output_type, rmm::cuda_stream_default, mr); } std::unique_ptr is_hex(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_hex(strings, 0, mr); + return detail::is_hex(strings, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 42bd70899a9..cfa64613c90 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -66,10 +66,10 @@ struct dispatch_to_integers_fn { template ::value>* = nullptr> void operator()(column_device_view const& strings_column, mutable_column_view& output_column, - cudaStream_t stream) const + rmm::cuda_stream_view stream) const { auto d_results = output_column.data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_column.size()), d_results, @@ -77,7 +77,7 @@ struct dispatch_to_integers_fn { } // non-integral types throw an exception template ::value>* = nullptr> - void operator()(column_device_view const&, 
mutable_column_view&, cudaStream_t) const + void operator()(column_device_view const&, mutable_column_view&, rmm::cuda_stream_view) const { CUDF_FAIL("Output for to_integers must be an integral type."); } @@ -86,7 +86,7 @@ struct dispatch_to_integers_fn { template <> void dispatch_to_integers_fn::operator()(column_device_view const&, mutable_column_view&, - cudaStream_t) const + rmm::cuda_stream_view) const { CUDF_FAIL("Output for to_integers must not be a boolean type."); } @@ -96,7 +96,7 @@ void dispatch_to_integers_fn::operator()(column_device_view const&, // This will convert a strings column into any integer column type. std::unique_ptr to_integers(strings_column_view const& strings, data_type output_type, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); @@ -104,13 +104,12 @@ std::unique_ptr to_integers(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create integer output column copying the strings null-mask - auto results = make_numeric_column( - output_type, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(output_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); // fill output column with integers type_dispatcher(output_type, dispatch_to_integers_fn{}, d_strings, results_view, stream); @@ -126,7 +125,7 @@ std::unique_ptr to_integers(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_integers(strings, output_type, cudaStream_t{}, mr); + return detail::to_integers(strings, output_type, rmm::cuda_stream_default, mr); } namespace detail { @@ -176,35 +175,34 @@ struct 
integer_to_string_fn { struct dispatch_from_integers_fn { template ::value>* = nullptr> std::unique_ptr operator()(column_view const& integers, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { size_type strings_count = integers.size(); auto column = column_device_view::create(integers, stream); auto d_column = *column; // copy the null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(integers, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(integers, stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), integer_to_string_size_fn{d_column}); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.template data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; auto chars_column = - detail::create_chars_child_column(strings_count, integers.null_count(), bytes, mr, stream); + detail::create_chars_child_column(strings_count, integers.null_count(), bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, integer_to_string_fn{d_column, d_new_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -217,8 +215,8 @@ struct dispatch_from_integers_fn { // non-integral types throw an exception template ::value>* = nullptr> std::unique_ptr 
operator()(column_view const&, - rmm::mr::device_memory_resource*, - cudaStream_t) const + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const { CUDF_FAIL("Values for from_integers function must be an integral type."); } @@ -226,7 +224,7 @@ struct dispatch_from_integers_fn { template <> std::unique_ptr dispatch_from_integers_fn::operator()( - column_view const&, rmm::mr::device_memory_resource*, cudaStream_t) const + column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const { CUDF_FAIL("Input for from_integers must not be a boolean type."); } @@ -235,13 +233,13 @@ std::unique_ptr dispatch_from_integers_fn::operator()( // This will convert all integer column types into a strings column. std::unique_ptr from_integers(column_view const& integers, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = integers.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); - return type_dispatcher(integers.type(), dispatch_from_integers_fn{}, integers, mr, stream); + return type_dispatcher(integers.type(), dispatch_from_integers_fn{}, integers, stream, mr); } } // namespace detail @@ -252,7 +250,7 @@ std::unique_ptr from_integers(column_view const& integers, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_integers(integers, cudaStream_t{}, mr); + return detail::from_integers(integers, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index dcccad30f30..e0303270987 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -73,24 +73,23 @@ struct ipv4_to_integers_fn { // Convert strings column of IPv4 addresses to integers column std::unique_ptr ipv4_to_integers( strings_column_view const& 
strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); if (strings_count == 0) return make_numeric_column(data_type{type_id::INT64}, 0); auto strings_column = column_device_view::create(strings.parent(), stream); // create output column copying the strings' null-mask - auto results = make_numeric_column( - data_type{type_id::INT64}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::INT64}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // fill output column with ipv4 integers - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -107,7 +106,7 @@ std::unique_ptr ipv4_to_integers(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ipv4_to_integers(strings, mr); + return detail::ipv4_to_integers(strings, rmm::cuda_stream_default, mr); } namespace detail { @@ -160,11 +159,11 @@ struct integers_to_ipv4_fn { // Convert integers into IPv4 addresses std::unique_ptr integers_to_ipv4( column_view const& integers, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = integers.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return 
make_empty_strings_column(stream, mr); CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type"); @@ -172,8 +171,7 @@ std::unique_ptr integers_to_ipv4( auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(integers, rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(integers, stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { @@ -188,19 +186,19 @@ std::unique_ptr integers_to_ipv4( return bytes; }); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, integers.null_count(), bytes, mr, stream); + create_chars_child_column(strings_count, integers.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, integers_to_ipv4_fn{d_column, d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -211,21 +209,20 @@ std::unique_ptr integers_to_ipv4( } std::unique_ptr is_ipv4(strings_column_view const& strings, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, 
- strings.size(), - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), d_results, @@ -264,14 +261,14 @@ std::unique_ptr integers_to_ipv4(column_view const& integers, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::integers_to_ipv4(integers, mr); + return detail::integers_to_ipv4(integers, rmm::cuda_stream_default, mr); } std::unique_ptr is_ipv4(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_ipv4(strings, 0, mr); + return detail::is_ipv4(strings, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 9b5c142511f..7d57e748cf3 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -113,36 +113,36 @@ struct url_encoder_fn { // std::unique_ptr url_encode( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // copy null mask - 
rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), url_encoder_fn{d_strings}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column auto chars_column = create_chars_child_column(strings_count, strings.null_count(), thrust::device_pointer_cast(d_offsets)[strings_count], - mr, - stream); + stream, + mr); auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, url_encoder_fn{d_strings, d_offsets, d_chars}); + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -159,7 +159,7 @@ std::unique_ptr url_encode(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::url_encode(strings, mr); + return detail::url_encode(strings, rmm::cuda_stream_default, mr); } namespace detail { @@ -216,23 +216,22 @@ struct url_decoder_fn { // std::unique_ptr url_decode( strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); auto strings_column = 
column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), url_decoder_fn{d_strings}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column @@ -240,14 +239,14 @@ std::unique_ptr url_decode( create_chars_child_column(strings_count, strings.null_count(), thrust::device_pointer_cast(d_offsets)[strings_count], - mr, - stream); + stream, + mr); auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, url_decoder_fn{d_strings, d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -265,7 +264,7 @@ std::unique_ptr url_decode(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::url_decode(strings, mr); + return detail::url_decode(strings, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index be56c256bfa..ffa6eb9a076 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -67,8 +67,8 @@ auto create_strings_device_views(std::vector const& views, rmm::cuda_stream_view stream) { // Create device views for each input view - using CDViewPtr = - 
decltype(column_device_view::create(std::declval(), std::declval())); + using CDViewPtr = decltype( + column_device_view::create(std::declval(), std::declval())); auto device_view_owners = std::vector(views.size()); std::transform( views.cbegin(), views.cend(), device_view_owners.begin(), [stream](auto const& col) { @@ -228,7 +228,7 @@ std::unique_ptr concatenate(std::vector const& columns, auto const total_bytes = std::get<5>(device_views); auto const offsets_count = strings_count + 1; - if (strings_count == 0) { return make_empty_strings_column(mr, stream.value()); } + if (strings_count == 0) { return make_empty_strings_column(stream, mr); } CUDF_EXPECTS(offsets_count <= std::numeric_limits::max(), "total number of strings is too large for cudf column"); diff --git a/cpp/src/strings/copying/copying.cu b/cpp/src/strings/copying/copying.cu index 4c99b45f5ce..384d6780116 100644 --- a/cpp/src/strings/copying/copying.cu +++ b/cpp/src/strings/copying/copying.cu @@ -22,6 +22,8 @@ #include #include +#include + #include namespace cudf { @@ -32,11 +34,11 @@ std::unique_ptr copy_slice(strings_column_view const& strings, size_type start, size_type end, size_type step, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); if (step == 0) step = 1; CUDF_EXPECTS(step > 0, "Parameter step must be positive integer."); if (end < 0 || end > strings_count) end = strings_count; @@ -46,7 +48,7 @@ std::unique_ptr copy_slice(strings_column_view const& strings, auto execpol = rmm::exec_policy(stream); // build indices rmm::device_vector indices(strings_count); - thrust::sequence(execpol->on(stream), indices.begin(), indices.end(), start, step); + thrust::sequence(execpol->on(stream.value()), indices.begin(), indices.end(), start, step); // create a column_view as a 
wrapper of these indices column_view indices_view( data_type{type_id::INT32}, strings_count, indices.data().get(), nullptr, 0); diff --git a/cpp/src/strings/extract.cu b/cpp/src/strings/extract.cu index 7a8fe7bee29..2973a52d27e 100644 --- a/cpp/src/strings/extract.cu +++ b/cpp/src/strings/extract.cu @@ -14,6 +14,9 @@ * limitations under the License. */ +#include +#include + #include #include #include @@ -23,8 +26,8 @@ #include #include #include -#include -#include + +#include namespace cudf { namespace strings { @@ -70,8 +73,8 @@ struct extract_fn { std::unique_ptr
extract( strings_column_view const& strings, std::string const& pattern, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -93,24 +96,24 @@ std::unique_ptr
extract( string_index_pair* d_indices = indices.data().get(); if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_indices, extract_fn{d_prog, d_strings, column_index}); else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_indices, extract_fn{d_prog, d_strings, column_index}); else - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_indices, extract_fn{d_prog, d_strings, column_index}); - // + results.emplace_back(make_strings_column(indices, stream, mr)); } return std::make_unique
(std::move(results)); @@ -125,7 +128,7 @@ std::unique_ptr
extract(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract(strings, pattern, mr); + return detail::extract(strings, pattern, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index 5ed3de2c888..d2e89d5b668 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -40,7 +40,7 @@ std::unique_ptr fill( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); CUDF_EXPECTS((begin >= 0) && (end <= strings_count), "Parameters [begin,end) are outside the range of the provided strings column"); CUDF_EXPECTS(begin <= end, "Parameters [begin,end) have invalid range values"); @@ -74,13 +74,13 @@ std::unique_ptr fill( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // create the chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = strings::detail::create_chars_child_column( - strings_count, null_count, bytes, mr, stream.value()); + size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; + auto chars_column = + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); // fill the chars column auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( diff --git a/cpp/src/strings/filter_chars.cu 
b/cpp/src/strings/filter_chars.cu index 975d84c7875..e75950f2984 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -105,11 +105,11 @@ std::unique_ptr filter_characters( std::vector> characters_to_filter, filter_type keep_characters, string_scalar const& replacement, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(replacement.is_valid(), "Parameter replacement must be valid"); cudf::string_view d_replacement(replacement.data(), replacement.size()); @@ -127,23 +127,22 @@ std::unique_ptr filter_characters( auto d_strings = *strings_column; // create null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // create offsets column filter_fn ffn{d_strings, keep_characters, table.begin(), table.end(), d_replacement}; auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), ffn); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); ffn.d_offsets = offsets_column->view().data(); // build chars column size_type bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + strings_count, strings.null_count(), bytes, stream, mr); ffn.d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), 
thrust::make_counting_iterator(0), strings_count, ffn); @@ -171,7 +170,7 @@ std::unique_ptr filter_characters( { CUDF_FUNC_RANGE(); return detail::filter_characters( - strings, characters_to_filter, keep_characters, replacement, 0, mr); + strings, characters_to_filter, keep_characters, replacement, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index d5a6356e3f1..67c2eff33b3 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -56,8 +56,8 @@ std::unique_ptr find_fn(strings_column_view const& strings, size_type start, size_type stop, FindFunction& pfn, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid."); CUDF_EXPECTS(start >= 0, "Parameter start must be positive integer or zero."); @@ -77,7 +77,7 @@ std::unique_ptr find_fn(strings_column_view const& strings, auto results_view = results->mutable_view(); auto d_results = results_view.data(); // set the position values by evaluating the passed function - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -99,8 +99,8 @@ std::unique_ptr find( string_scalar const& target, size_type start = 0, size_type stop = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__( string_view d_string, string_view d_target, size_type start, size_type stop) { @@ -111,7 +111,7 @@ std::unique_ptr find( return d_string.find(d_target, begin, end - begin); }; - return find_fn(strings, target, start, stop, pfn, mr, stream); + 
return find_fn(strings, target, start, stop, pfn, stream, mr); } std::unique_ptr rfind( @@ -119,8 +119,8 @@ std::unique_ptr rfind( string_scalar const& target, size_type start = 0, size_type stop = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__( string_view d_string, string_view d_target, size_type start, size_type stop) { @@ -131,7 +131,7 @@ std::unique_ptr rfind( return d_string.rfind(d_target, begin, end - begin); }; - return find_fn(strings, target, start, stop, pfn, mr, stream); + return find_fn(strings, target, start, stop, pfn, stream, mr); } } // namespace detail @@ -145,7 +145,7 @@ std::unique_ptr find(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find(strings, target, start, stop, mr); + return detail::find(strings, target, start, stop, rmm::cuda_stream_default, mr); } std::unique_ptr rfind(strings_column_view const& strings, @@ -155,7 +155,7 @@ std::unique_ptr rfind(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rfind(strings, target, start, stop, mr); + return detail::rfind(strings, target, start, stop, rmm::cuda_stream_default, mr); } namespace detail { @@ -179,8 +179,8 @@ template std::unique_ptr contains_fn(strings_column_view const& strings, string_scalar const& target, BoolFunction pfn, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); if (strings_count == 0) return make_empty_column(data_type{type_id::BOOL8}); @@ -208,7 +208,7 @@ std::unique_ptr contains_fn(strings_column_view const& strings, auto results_view = results->mutable_view(); auto d_results = results_view.data(); 
// set the bool values by evaluating the passed function - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -240,8 +240,8 @@ template std::unique_ptr contains_fn(strings_column_view const& strings, strings_column_view const& targets, BoolFunction pfn, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (strings.is_empty()) return make_empty_column(data_type{type_id::BOOL8}); @@ -263,7 +263,7 @@ std::unique_ptr contains_fn(strings_column_view const& strings, auto d_results = results_view.data(); // set the bool values by evaluating the passed function thrust::transform( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), d_results, @@ -286,56 +286,56 @@ std::unique_ptr contains_fn(strings_column_view const& strings, std::unique_ptr contains( strings_column_view const& strings, string_scalar const& target, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__(string_view d_string, string_view d_target) { return d_string.find(d_target) >= 0; }; - return contains_fn(strings, target, pfn, mr, stream); + return contains_fn(strings, target, pfn, stream, mr); } std::unique_ptr contains( strings_column_view const& strings, strings_column_view const& targets, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__(string_view 
d_string, string_view d_target) { return d_string.find(d_target) >= 0; }; - return contains_fn(strings, targets, pfn, mr, stream); + return contains_fn(strings, targets, pfn, stream, mr); } std::unique_ptr starts_with( strings_column_view const& strings, string_scalar const& target, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__(string_view d_string, string_view d_target) { return d_string.find(d_target) == 0; }; - return contains_fn(strings, target, pfn, mr, stream); + return contains_fn(strings, target, pfn, stream, mr); } std::unique_ptr starts_with( strings_column_view const& strings, strings_column_view const& targets, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__(string_view d_string, string_view d_target) { return d_string.find(d_target) == 0; }; - return contains_fn(strings, targets, pfn, mr, stream); + return contains_fn(strings, targets, pfn, stream, mr); } std::unique_ptr ends_with( strings_column_view const& strings, string_scalar const& target, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__(string_view d_string, string_view d_target) { auto str_length = d_string.length(); @@ -344,14 +344,14 @@ std::unique_ptr ends_with( return d_string.find(d_target, str_length - tgt_length) >= 0; }; - return contains_fn(strings, target, pfn, mr, stream); + return contains_fn(strings, target, pfn, stream, mr); } std::unique_ptr ends_with( strings_column_view const& strings, 
strings_column_view const& targets, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto pfn = [] __device__(string_view d_string, string_view d_target) { auto str_length = d_string.length(); @@ -360,7 +360,7 @@ std::unique_ptr ends_with( return d_string.find(d_target, str_length - tgt_length) >= 0; }; - return contains_fn(strings, targets, pfn, mr, stream); + return contains_fn(strings, targets, pfn, stream, mr); } } // namespace detail @@ -372,7 +372,7 @@ std::unique_ptr contains(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(strings, target, mr); + return detail::contains(strings, target, rmm::cuda_stream_default, mr); } std::unique_ptr contains(strings_column_view const& strings, @@ -380,7 +380,7 @@ std::unique_ptr contains(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(strings, targets, mr); + return detail::contains(strings, targets, rmm::cuda_stream_default, mr); } std::unique_ptr starts_with(strings_column_view const& strings, @@ -388,7 +388,7 @@ std::unique_ptr starts_with(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::starts_with(strings, target, mr); + return detail::starts_with(strings, target, rmm::cuda_stream_default, mr); } std::unique_ptr starts_with(strings_column_view const& strings, @@ -396,7 +396,7 @@ std::unique_ptr starts_with(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::starts_with(strings, targets, mr); + return detail::starts_with(strings, targets, rmm::cuda_stream_default, mr); } std::unique_ptr ends_with(strings_column_view const& strings, @@ -404,7 +404,7 @@ std::unique_ptr 
ends_with(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ends_with(strings, target, mr); + return detail::ends_with(strings, target, rmm::cuda_stream_default, mr); } std::unique_ptr ends_with(strings_column_view const& strings, @@ -412,7 +412,7 @@ std::unique_ptr ends_with(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ends_with(strings, targets, mr); + return detail::ends_with(strings, targets, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/find_multiple.cu b/cpp/src/strings/find_multiple.cu index 45225b13196..6bcaf7ccea7 100644 --- a/cpp/src/strings/find_multiple.cu +++ b/cpp/src/strings/find_multiple.cu @@ -22,6 +22,8 @@ #include #include +#include + #include namespace cudf { @@ -30,8 +32,8 @@ namespace detail { std::unique_ptr find_multiple( strings_column_view const& strings, strings_column_view const& targets, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); if (strings_count == 0) return make_empty_column(data_type{type_id::INT32}); @@ -55,7 +57,7 @@ std::unique_ptr find_multiple( auto results_view = results->mutable_view(); auto d_results = results_view.data(); // fill output column with position values - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(total_count), d_results, @@ -78,7 +80,7 @@ std::unique_ptr find_multiple(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::find_multiple(strings, targets, mr); + return detail::find_multiple(strings, targets, rmm::cuda_stream_default, mr); } } 
// namespace strings diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 05b5293e432..578787605ee 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -50,17 +50,16 @@ struct compute_pad_output_length_fn { } // namespace -// std::unique_ptr pad( strings_column_view const& strings, size_type width, pad_side side = pad_side::RIGHT, std::string const& fill_char = " ", - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(!fill_char.empty(), "fill_char parameter must not be empty"); char_utf8 d_fill_char = 0; size_type fill_char_size = to_char_utf8(fill_char.c_str(), d_fill_char); @@ -70,26 +69,25 @@ std::unique_ptr pad( auto d_strings = *strings_column; // create null_mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), compute_pad_output_length_fn{d_strings, width, fill_char_size}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + 
strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); if (side == pad_side::LEFT) { thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, width, d_fill_char, d_offsets, d_chars] __device__(size_type idx) { @@ -102,7 +100,7 @@ std::unique_ptr pad( }); } else if (side == pad_side::RIGHT) { thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, width, d_fill_char, d_offsets, d_chars] __device__(size_type idx) { @@ -115,7 +113,7 @@ std::unique_ptr pad( }); } else if (side == pad_side::BOTH) { thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, width, d_fill_char, d_offsets, d_chars] __device__(size_type idx) { @@ -131,7 +129,7 @@ std::unique_ptr pad( while (right_pad-- > 0) ptr += from_char_utf8(d_fill_char, ptr); }); } - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -149,34 +147,33 @@ std::unique_ptr pad( std::unique_ptr zfill( strings_column_view const& strings, size_type width, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // copy bitmask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build 
offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), compute_pad_output_length_fn{d_strings, width, 1}); // fillchar is 1 byte auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, width, d_offsets, d_chars] __device__(size_type idx) { @@ -199,7 +196,7 @@ std::unique_ptr zfill( } // namespace detail -// external APIs +// Public APIs std::unique_ptr pad(strings_column_view const& strings, size_type width, @@ -208,7 +205,7 @@ std::unique_ptr pad(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::pad(strings, width, side, fill_char, mr); + return detail::pad(strings, width, side, fill_char, rmm::cuda_stream_default, mr); } std::unique_ptr zfill(strings_column_view const& strings, @@ -216,7 +213,7 @@ std::unique_ptr zfill(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::zfill(strings, width, mr); + return detail::zfill(strings, width, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index 6e9e92844c5..6e03c183a8d 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ 
-15,16 +15,20 @@ */ #pragma once -#include #include + +#include + #include #include namespace cudf { + class string_view; namespace strings { namespace detail { + struct reljunk; struct reinst; class reprog; @@ -76,7 +80,7 @@ class reprog_device { std::string const& pattern, const uint8_t* cp_flags, int32_t strings_count, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * @brief Called automatically by the unique_ptr returned from create(). */ diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu index 46f7904410b..8089244803e 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,12 @@ */ #include +#include + #include + +#include #include -#include namespace cudf { namespace strings { @@ -72,7 +75,7 @@ std::unique_ptr> reprog_devic std::string const& pattern, const uint8_t* codepoint_flags, size_type strings_count, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::vector pattern32 = string_to_char32_vector(pattern); // compile pattern into host object @@ -148,7 +151,8 @@ std::unique_ptr> reprog_devic } // copy flat prog to device memory - CUDA_TRY(cudaMemcpy(d_buffer->data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice)); + CUDA_TRY(cudaMemcpyAsync( + d_buffer->data(), h_buffer.data(), memsize, cudaMemcpyHostToDevice, stream.value())); // auto deleter = [d_buffer, d_relists](reprog_device* t) { t->destroy(); diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 3b3043d37d0..95f9ecbe2ef 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -14,21 +14,24 @@ * limitations under the License. 
*/ +#include "backref_re.cuh" + +#include +#include + #include #include #include +#include #include -#include #include #include #include #include -#include -#include -#include +#include -#include "backref_re.cuh" +#include namespace cudf { namespace strings { @@ -81,11 +84,11 @@ std::unique_ptr replace_with_backrefs( strings_column_view const& strings, std::string const& pattern, std::string const& repl, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); CUDF_EXPECTS(!repl.empty(), "Parameter repl must not be empty"); @@ -105,7 +108,7 @@ std::unique_ptr replace_with_backrefs( string_view d_repl_template{repl_scalar.data(), repl_scalar.size()}; // copy null mask - auto null_mask = copy_bitmask(strings.parent()); + auto null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); auto null_count = strings.null_count(); // create child columns @@ -118,14 +121,14 @@ std::unique_ptr replace_with_backrefs( d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, strings_count, null_count, - mr, - stream); + stream, + mr); } else if (regex_insts <= RX_MEDIUM_INSTS) children = replace_with_backrefs_medium( - d_strings, d_prog, d_repl_template, backrefs, null_count, mr, stream); + d_strings, d_prog, d_repl_template, backrefs, null_count, stream, mr); else children = replace_with_backrefs_large( - d_strings, d_prog, d_repl_template, backrefs, null_count, mr, stream); + d_strings, d_prog, d_repl_template, backrefs, null_count, stream, mr); return make_strings_column(strings_count, std::move(children.first), @@ -146,7 +149,7 @@ 
std::unique_ptr replace_with_backrefs(strings_column_view const& strings rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_with_backrefs(strings, pattern, repl, mr); + return detail::replace_with_backrefs(strings, pattern, repl, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/backref_re.cuh b/cpp/src/strings/replace/backref_re.cuh index bf0644c65ee..d5bec759528 100644 --- a/cpp/src/strings/replace/backref_re.cuh +++ b/cpp/src/strings/replace/backref_re.cuh @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -118,16 +120,16 @@ children_pair replace_with_backrefs_medium(column_device_view const& d_strings, string_view const& d_repl_template, rmm::device_vector& backrefs, size_type null_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); children_pair replace_with_backrefs_large(column_device_view const& d_strings, reprog_device& d_prog, string_view const& d_repl_template, rmm::device_vector& backrefs, size_type null_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/src/strings/replace/backref_re_large.cu b/cpp/src/strings/replace/backref_re_large.cu index 5e7b2b3c2fc..0b078132623 100644 --- a/cpp/src/strings/replace/backref_re_large.cu +++ b/cpp/src/strings/replace/backref_re_large.cu @@ -14,9 +14,11 @@ * limitations under the License. 
*/ +#include "backref_re.cuh" + #include -#include "backref_re.cuh" +#include namespace cudf { namespace strings { @@ -28,16 +30,16 @@ children_pair replace_with_backrefs_large(column_device_view const& d_strings, string_view const& d_repl_template, rmm::device_vector& backrefs, size_type null_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return make_strings_children( backrefs_fn{ d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, d_strings.size(), null_count, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/strings/replace/backref_re_medium.cu b/cpp/src/strings/replace/backref_re_medium.cu index e75494e8c55..899e0cb2a3e 100644 --- a/cpp/src/strings/replace/backref_re_medium.cu +++ b/cpp/src/strings/replace/backref_re_medium.cu @@ -14,9 +14,11 @@ * limitations under the License. */ +#include "backref_re.cuh" + #include -#include "backref_re.cuh" +#include namespace cudf { namespace strings { @@ -28,16 +30,16 @@ children_pair replace_with_backrefs_medium(column_device_view const& d_strings, string_view const& d_repl_template, rmm::device_vector& backrefs, size_type null_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return make_strings_children( backrefs_fn{ d_strings, d_prog, d_repl_template, backrefs.begin(), backrefs.end()}, d_strings.size(), null_count, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index d43dff4548c..81f1c694716 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -14,6 +14,10 @@ * limitations under the License. 
*/ +#include +#include +#include + #include #include #include @@ -23,9 +27,8 @@ #include #include #include -#include -#include -#include + +#include namespace cudf { namespace strings { @@ -127,16 +130,15 @@ struct replace_multi_regex_fn { } // namespace -// std::unique_ptr replace_re( strings_column_view const& strings, std::vector const& patterns, strings_column_view const& repls, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); if (patterns.empty()) // no patterns; just return a copy return std::make_unique(strings.parent()); @@ -177,25 +179,25 @@ std::unique_ptr replace_re( d_strings, d_progs, static_cast(progs.size()), d_found_ranges, d_repls}, strings_count, null_count, - mr, - stream); + stream, + mr); else if (regex_insts <= RX_MEDIUM_INSTS) children = make_strings_children( replace_multi_regex_fn{ d_strings, d_progs, static_cast(progs.size()), d_found_ranges, d_repls}, strings_count, null_count, - mr, - stream); + stream, + mr); else children = make_strings_children( replace_multi_regex_fn{ d_strings, d_progs, static_cast(progs.size()), d_found_ranges, d_repls}, strings_count, null_count, - mr, - stream); - // + stream, + mr); + return make_strings_column(strings_count, std::move(children.first), std::move(children.second), @@ -215,7 +217,7 @@ std::unique_ptr replace_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re(strings, patterns, repls, mr); + return detail::replace_re(strings, patterns, repls, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/replace.cu 
b/cpp/src/strings/replace/replace.cu index a1aca664e25..7a22d70d4d3 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -99,7 +99,7 @@ std::unique_ptr replace(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid."); CUDF_EXPECTS(target.size() > 0, "Parameter target must not be empty string."); @@ -117,20 +117,20 @@ std::unique_ptr replace(strings_column_view const& strings, thrust::make_counting_iterator(0), replace_fn{d_strings, d_target, d_repl, maxrepl}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream.value()); + create_chars_child_column(strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, replace_fn{d_strings, d_target, d_repl, maxrepl, d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -187,7 +187,7 @@ std::unique_ptr replace_slice(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); + if 
(strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop."); @@ -197,21 +197,20 @@ std::unique_ptr replace_slice(strings_column_view const& strings, auto d_strings = *strings_column; // copy the null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), replace_slice_fn{d_strings, d_repl, start, stop}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream.value()); + create_chars_child_column(strings_count, strings.null_count(), bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); thrust::for_each_n( @@ -291,7 +290,7 @@ std::unique_ptr replace(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)), "Parameters targets must not be empty and must not have nulls"); CUDF_EXPECTS(((repls.size() > 0) && (repls.null_count() == 0)), @@ -313,20 +312,20 @@ std::unique_ptr 
replace(strings_column_view const& strings, thrust::make_counting_iterator(0), replace_multi_fn{d_strings, d_targets, d_repls}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream.value()); + create_chars_child_column(strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, replace_multi_fn{d_strings, d_targets, d_repls, d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -342,7 +341,7 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); string_view d_repl(repl.data(), repl.size()); @@ -357,13 +356,13 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, : d_strings.element(idx).size_bytes(); }); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = 
strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream.value()); + strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 4f6ec56d213..4eff05ba7b7 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -14,18 +14,21 @@ * limitations under the License. */ +#include +#include +#include + #include #include #include +#include #include -#include #include #include #include #include -#include -#include -#include + +#include namespace cudf { namespace strings { @@ -105,11 +108,11 @@ std::unique_ptr replace_re( std::string const& pattern, string_scalar const& repl = string_scalar(""), size_type maxrepl = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid"); string_view d_repl(repl.data(), repl.size()); @@ -122,7 +125,7 @@ std::unique_ptr replace_re( auto regex_insts = d_prog.insts_counts(); // copy null mask - auto null_mask = copy_bitmask(strings.parent()); + auto null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); auto null_count = strings.null_count(); // create child columns @@ -134,22 +137,22 @@ std::unique_ptr replace_re( make_strings_children(replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, strings_count, null_count, - mr, - stream); + stream, + mr); else if (regex_insts <= 
RX_MEDIUM_INSTS) children = make_strings_children(replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, strings_count, null_count, - mr, - stream); + stream, + mr); else children = make_strings_children(replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, strings_count, null_count, - mr, - stream); + stream, + mr); return make_strings_column(strings_count, std::move(children.first), @@ -171,7 +174,7 @@ std::unique_ptr replace_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re(strings, pattern, repl, maxrepl, mr); + return detail::replace_re(strings, pattern, repl, maxrepl, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu index 6102de7335e..14c0e754abd 100644 --- a/cpp/src/strings/split/partition.cu +++ b/cpp/src/strings/split/partition.cu @@ -24,6 +24,8 @@ #include #include +#include + #include namespace cudf { @@ -176,8 +178,8 @@ struct rpartition_fn : public partition_fn { std::unique_ptr
partition( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); auto strings_count = strings.size(); @@ -189,7 +191,7 @@ std::unique_ptr
partition( partition_fn partitioner( *strings_column, d_delimiter, left_indices, delim_indices, right_indices); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, partitioner); @@ -203,8 +205,8 @@ std::unique_ptr
partition( std::unique_ptr
rpartition( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); auto strings_count = strings.size(); @@ -215,7 +217,7 @@ std::unique_ptr
rpartition( right_indices(strings_count); rpartition_fn partitioner( *strings_column, d_delimiter, left_indices, delim_indices, right_indices); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, partitioner); @@ -236,7 +238,7 @@ std::unique_ptr
partition(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::partition(strings, delimiter, mr); + return detail::partition(strings, delimiter, rmm::cuda_stream_default, mr); } std::unique_ptr
rpartition(strings_column_view const& strings, @@ -244,7 +246,7 @@ std::unique_ptr
rpartition(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rpartition(strings, delimiter, mr); + return detail::rpartition(strings, delimiter, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index fb0efa1131c..61d7adf8674 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -26,7 +28,7 @@ #include #include -#include +#include #include // upper_bound() #include // copy_if() @@ -422,13 +424,13 @@ struct rsplit_tokenizer_fn : base_split_tokenizer { template std::unique_ptr
split_fn(strings_column_view const& strings_column, Tokenizer tokenizer, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::vector> results; auto strings_count = strings_column.size(); if (strings_count == 0) { - results.push_back(make_empty_strings_column(mr, stream)); + results.push_back(make_empty_strings_column(stream, mr)); return std::make_unique
(std::move(results)); } @@ -440,7 +442,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, // count the number of delimiters in the entire column size_type delimiter_count = - thrust::count_if(execpol->on(stream), + thrust::count_if(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(chars_bytes), [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) { @@ -450,7 +452,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, // create vector of every delimiter position in the chars column rmm::device_vector delimiter_positions(delimiter_count); auto d_positions = delimiter_positions.data().get(); - auto copy_end = thrust::copy_if(execpol->on(stream), + auto copy_end = thrust::copy_if(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(chars_bytes), delimiter_positions.begin(), @@ -461,7 +463,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, // create vector of string indices for each delimiter rmm::device_vector string_indices(delimiter_count); // these will be strings that auto d_string_indices = string_indices.data().get(); // only contain delimiters - thrust::upper_bound(execpol->on(stream), + thrust::upper_bound(execpol->on(stream.value()), d_offsets, d_offsets + strings_count, delimiter_positions.begin(), @@ -472,7 +474,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, rmm::device_vector token_counts(strings_count); auto d_token_counts = token_counts.data().get(); // first, initialize token counts for strings without delimiters in them - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_token_counts, @@ -482,7 +484,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, }); // now compute the number of tokens in each string thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), delimiter_count, [tokenizer, d_positions, delimiter_count, d_string_indices, d_token_counts] __device__( @@ -492,7 +494,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, // the columns_count is the maximum number of tokens for any string size_type columns_count = - *thrust::max_element(execpol->on(stream), token_counts.begin(), token_counts.end()); + *thrust::max_element(execpol->on(stream.value()), token_counts.begin(), token_counts.end()); // boundary case: if no columns, return one null column (custrings issue #119) if (columns_count == 0) { results.push_back(std::make_unique( @@ -508,7 +510,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, string_index_pair* d_tokens = tokens.data().get(); // initialize the token positions // -- accounts for nulls, empty, and strings with no delimiter in them - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [tokenizer, columns_count, d_tokens] __device__(size_type idx) { @@ -516,7 +518,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, }); // get the positions for every token using the delimiter positions - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), delimiter_count, [tokenizer, @@ -541,7 +543,7 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, for (size_type col = 0; col < columns_count; ++col) { auto column_tokens = d_tokens + (col * strings_count); results.emplace_back( - make_strings_column(column_tokens, column_tokens + strings_count, mr, stream)); + make_strings_column(column_tokens, column_tokens + strings_count, stream, mr)); } return std::make_unique
(std::move(results)); } @@ -742,8 +744,8 @@ struct whitespace_rsplit_tokenizer_fn : base_whitespace_split_tokenizer { template std::unique_ptr
whitespace_split_fn(size_type strings_count, Tokenizer tokenizer, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto execpol = rmm::exec_policy(stream); @@ -753,14 +755,14 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, auto d_token_counts = token_counts.data().get(); if (strings_count > 0) { thrust::transform( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_token_counts, [tokenizer] __device__(size_type idx) { return tokenizer.count_tokens(idx); }); // column count is the maximum number of tokens for any string columns_count = - *thrust::max_element(execpol->on(stream), token_counts.begin(), token_counts.end()); + *thrust::max_element(execpol->on(stream.value()), token_counts.begin(), token_counts.end()); } std::vector> results; @@ -777,12 +779,12 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, // get the positions for every token rmm::device_vector tokens(columns_count * strings_count); string_index_pair* d_tokens = tokens.data().get(); - thrust::fill(execpol->on(stream), + thrust::fill(execpol->on(stream.value()), d_tokens, d_tokens + (columns_count * strings_count), string_index_pair{nullptr, 0}); thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [tokenizer, columns_count, d_token_counts, d_tokens] __device__(size_type idx) { @@ -795,7 +797,7 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, for (size_type col = 0; col < columns_count; ++col) { auto column_tokens = d_tokens + (col * strings_count); results.emplace_back( - make_strings_column(column_tokens, column_tokens + strings_count, mr, stream)); + make_strings_column(column_tokens, column_tokens + strings_count, stream, mr)); } return std::make_unique
(std::move(results)); } @@ -806,8 +808,8 @@ std::unique_ptr
split( strings_column_view const& strings_column, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); @@ -818,21 +820,21 @@ std::unique_ptr
split( if (delimiter.size() == 0) { return whitespace_split_fn(strings_column.size(), whitespace_split_tokenizer_fn{*strings_device_view, max_tokens}, - mr, - stream); + stream, + mr); } string_view d_delimiter(delimiter.data(), delimiter.size()); return split_fn( - strings_column, split_tokenizer_fn{*strings_device_view, d_delimiter, max_tokens}, mr, stream); + strings_column, split_tokenizer_fn{*strings_device_view, d_delimiter, max_tokens}, stream, mr); } std::unique_ptr
rsplit( strings_column_view const& strings_column, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); @@ -843,13 +845,13 @@ std::unique_ptr
rsplit( if (delimiter.size() == 0) { return whitespace_split_fn(strings_column.size(), whitespace_rsplit_tokenizer_fn{*strings_device_view, max_tokens}, - mr, - stream); + stream, + mr); } string_view d_delimiter(delimiter.data(), delimiter.size()); return split_fn( - strings_column, rsplit_tokenizer_fn{*strings_device_view, d_delimiter, max_tokens}, mr, stream); + strings_column, rsplit_tokenizer_fn{*strings_device_view, d_delimiter, max_tokens}, stream, mr); } } // namespace detail @@ -862,7 +864,7 @@ std::unique_ptr
split(strings_column_view const& strings_column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split(strings_column, delimiter, maxsplit, mr); + return detail::split(strings_column, delimiter, maxsplit, rmm::cuda_stream_default, mr); } std::unique_ptr
rsplit(strings_column_view const& strings_column, @@ -871,7 +873,7 @@ std::unique_ptr
rsplit(strings_column_view const& strings_column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit(strings_column, delimiter, maxsplit, mr); + return detail::rsplit(strings_column, delimiter, maxsplit, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 6a88809ea92..8cd5ed1fd1f 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -23,7 +25,8 @@ #include #include #include -#include + +#include #include #include @@ -221,21 +224,23 @@ template std::unique_ptr split_record_fn(strings_column_view const& strings, TokenCounter counter, TokenReader reader, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // create offsets column by counting the number of tokens per string auto strings_count = strings.size(); auto offsets = make_numeric_column( data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); auto d_offsets = offsets->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_offsets, counter); - thrust::exclusive_scan( - rmm::exec_policy(stream)->on(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), + d_offsets, + d_offsets + strings_count + 1, + d_offsets); // last entry is the total number of tokens to be generated auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); @@ -243,12 +248,12 @@ std::unique_ptr split_record_fn(strings_column_view const& strings, rmm::device_vector tokens(total_tokens); 
reader.d_token_offsets = d_offsets; reader.d_tokens = tokens.data().get(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, reader); // convert the index-pairs into one big strings column - auto strings_output = make_strings_column(tokens.begin(), tokens.end(), mr, stream); + auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); // create a lists column using the offsets and the strings columns return make_lists_column(strings_count, std::move(offsets), @@ -262,8 +267,8 @@ std::unique_ptr split_record( strings_column_view const& strings, string_scalar const& delimiter = string_scalar(""), size_type maxsplit = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); @@ -275,15 +280,15 @@ std::unique_ptr split_record( return split_record_fn(strings, whitespace_token_counter_fn{*d_strings_column_ptr, max_tokens}, whitespace_token_reader_fn{*d_strings_column_ptr, max_tokens}, - mr, - stream); + stream, + mr); } else { string_view d_delimiter(delimiter.data(), delimiter.size()); return split_record_fn(strings, token_counter_fn{*d_strings_column_ptr, d_delimiter, max_tokens}, token_reader_fn{*d_strings_column_ptr, d_delimiter}, - mr, - stream); + stream, + mr); } } @@ -297,7 +302,8 @@ std::unique_ptr split_record(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record(strings, delimiter, maxsplit, mr, 0); + return detail::split_record( + strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); } std::unique_ptr rsplit_record(strings_column_view const& strings, @@ -306,7 +312,8 @@ 
std::unique_ptr rsplit_record(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record(strings, delimiter, maxsplit, mr, 0); + return detail::split_record( + strings, delimiter, maxsplit, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 60da9b682ec..2e387d91d2b 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -28,32 +28,32 @@ #include #include -// clang-format off namespace cudf { // Create a strings-type column from vector of pointer/size pairs std::unique_ptr make_strings_column( const rmm::device_vector>& strings, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { + rmm::mr::device_memory_resource* mr) +{ CUDF_FUNC_RANGE(); size_type strings_count = strings.size(); - if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream.value()); + if (strings_count == 0) return strings::detail::make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); auto d_strings = strings.data().get(); // check total size is not too large for cudf column - size_t bytes = thrust::transform_reduce( - execpol->on(stream.value()), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - [d_strings] __device__(size_t idx) { - auto item = d_strings[idx]; - return (item.first != nullptr) ? item.second : 0; - }, - 0, - thrust::plus()); + auto size_checker = [d_strings] __device__(size_t idx) { + auto item = d_strings[idx]; + return (item.first != nullptr) ? 
item.second : 0; + }; + size_t bytes = thrust::transform_reduce(execpol->on(stream.value()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + size_checker, + 0, + thrust::plus()); CUDF_EXPECTS(bytes < std::numeric_limits::max(), "total size of strings is too large for cudf column"); @@ -65,7 +65,7 @@ std::unique_ptr make_strings_column( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.data(); @@ -82,7 +82,7 @@ std::unique_ptr make_strings_column( // build chars column auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream.value()); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); thrust::for_each_n(execpol->on(stream.value()), @@ -107,7 +107,8 @@ std::unique_ptr make_strings_column( struct string_view_to_pair { string_view null_placeholder; string_view_to_pair(string_view n) : null_placeholder(n) {} - __device__ thrust::pair operator()(const string_view& i) { + __device__ thrust::pair operator()(const string_view& i) + { return (i.data() == null_placeholder.data()) ? 
thrust::pair{nullptr, 0} : thrust::pair{i.data(), i.size_bytes()}; @@ -118,7 +119,8 @@ struct string_view_to_pair { std::unique_ptr make_strings_column(const rmm::device_vector& string_views, const string_view null_placeholder, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { + rmm::mr::device_memory_resource* mr) +{ auto it_pair = thrust::make_transform_iterator(string_views.begin(), string_view_to_pair{null_placeholder}); const rmm::device_vector> dev_strings( @@ -132,10 +134,11 @@ std::unique_ptr make_strings_column(const rmm::device_vector& stri const rmm::device_vector& valid_mask, size_type null_count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { + rmm::mr::device_memory_resource* mr) +{ CUDF_FUNC_RANGE(); size_type num_strings = offsets.size() - 1; - if (num_strings == 0) return strings::detail::make_empty_strings_column(mr, stream.value()); + if (num_strings == 0) return strings::detail::make_empty_strings_column(stream, mr); CUDF_EXPECTS(null_count < num_strings, "null strings column not yet supported"); if (null_count > 0) { @@ -147,8 +150,8 @@ std::unique_ptr make_strings_column(const rmm::device_vector& stri CUDF_EXPECTS(bytes >= 0, "invalid offsets vector"); // build offsets column -- this is the number of strings + 1 - auto offsets_column = - make_numeric_column(data_type{type_id::INT32}, num_strings + 1, mask_state::UNALLOCATED, stream, mr); + auto offsets_column = make_numeric_column( + data_type{type_id::INT32}, num_strings + 1, mask_state::UNALLOCATED, stream, mr); auto offsets_view = offsets_column->mutable_view(); CUDA_TRY(cudaMemcpyAsync(offsets_view.data(), offsets.data().get(), @@ -159,17 +162,20 @@ std::unique_ptr make_strings_column(const rmm::device_vector& stri rmm::device_buffer null_mask{ valid_mask.data().get(), valid_mask.size() * - sizeof( - bitmask_type)}; // Or this works too: sizeof(typename std::remove_reference_t::value_type) - // Following give the incorrect value of 8 instead of 
4 because of smart references: - // sizeof(valid_mask[0]), sizeof(decltype(valid_mask.front())) + sizeof(bitmask_type)}; // Or this works too: sizeof(typename + // std::remove_reference_t::value_type) + // Following give the incorrect value of 8 instead of 4 because of smart references: + // sizeof(valid_mask[0]), sizeof(decltype(valid_mask.front())) // build chars column auto chars_column = - strings::detail::create_chars_child_column(num_strings, null_count, bytes, mr, stream.value()); + strings::detail::create_chars_child_column(num_strings, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); - CUDA_TRY(cudaMemcpyAsync( - chars_view.data(), strings.data().get(), bytes, cudaMemcpyDeviceToDevice, stream.value())); + CUDA_TRY(cudaMemcpyAsync(chars_view.data(), + strings.data().get(), + bytes, + cudaMemcpyDeviceToDevice, + stream.value())); return make_strings_column(num_strings, std::move(offsets_column), @@ -186,7 +192,8 @@ std::unique_ptr make_strings_column(const std::vector& strings, const std::vector& null_mask, size_type null_count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { + rmm::mr::device_memory_resource* mr) +{ rmm::device_vector d_strings{strings}; rmm::device_vector d_offsets{offsets}; rmm::device_vector d_null_mask{null_mask}; @@ -201,7 +208,8 @@ std::unique_ptr make_strings_column(size_type num_strings, size_type null_count, rmm::device_buffer&& null_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { + rmm::mr::device_memory_resource* mr) +{ if (null_count > 0) CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); CUDF_EXPECTS(num_strings == offsets_column->size() - 1, "Invalid offsets column size for strings column."); @@ -220,4 +228,3 @@ std::unique_ptr make_strings_column(size_type num_strings, } } // namespace cudf -// clang-format on TODO fix diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cu index 
679ac6a2bb5..106f133229b 100644 --- a/cpp/src/strings/strings_column_view.cu +++ b/cpp/src/strings/strings_column_view.cu @@ -20,9 +20,12 @@ #include #include +#include + #include #include #include + #include namespace cudf { @@ -130,30 +133,36 @@ void print(strings_column_view const& strings, // std::pair, rmm::device_vector> create_offsets( - strings_column_view const& strings, cudaStream_t stream, rmm::mr::device_memory_resource* mr) + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); size_type count = strings.size(); const int32_t* d_offsets = strings.offsets().data(); d_offsets += strings.offset(); // nvbug-2808421 : do not combine with the previous line int32_t first = 0; - CUDA_TRY(cudaMemcpyAsync(&first, d_offsets, sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_TRY( + cudaMemcpyAsync(&first, d_offsets, sizeof(int32_t), cudaMemcpyDeviceToHost, stream.value())); rmm::device_vector offsets(count + 1); // normalize the offset values for the column offset thrust::transform( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), d_offsets, d_offsets + count + 1, offsets.begin(), [first] __device__(int32_t offset) { return static_cast(offset - first); }); // copy the chars column data int32_t bytes = 0; // last offset entry is the size in bytes - CUDA_TRY( - cudaMemcpyAsync(&bytes, d_offsets + count, sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_TRY(cudaMemcpyAsync( + &bytes, d_offsets + count, sizeof(int32_t), cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); + bytes -= first; const char* d_chars = strings.chars().data() + first; rmm::device_vector chars(bytes); - CUDA_TRY(cudaMemcpyAsync(chars.data().get(), d_chars, bytes, cudaMemcpyDeviceToHost, stream)); + CUDA_TRY( + cudaMemcpyAsync(chars.data().get(), d_chars, bytes, cudaMemcpyDeviceToHost, stream.value())); // return offsets and chars return 
std::make_pair(std::move(chars), std::move(offsets)); } diff --git a/cpp/src/strings/strings_scalar_factories.cpp b/cpp/src/strings/strings_scalar_factories.cpp index d3256e4ccb8..9c7f905cb0b 100644 --- a/cpp/src/strings/strings_scalar_factories.cpp +++ b/cpp/src/strings/strings_scalar_factories.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,12 @@ #include +#include + namespace cudf { // Create a strings-type column from array of pointer/size pairs std::unique_ptr make_string_scalar(std::string const& string, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto s = new string_scalar(string, true, stream, mr); diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index ea5a2d8ef69..907999bf50d 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -106,11 +106,11 @@ std::unique_ptr strip( strings_column_view const& strings, strip_type stype = strip_type::BOTH, string_scalar const& to_strip = string_scalar(""), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); CUDF_EXPECTS(to_strip.is_valid(), "Parameter to_strip must be valid"); string_view d_to_strip(to_strip.data(), to_strip.size()); @@ -121,27 +121,26 @@ std::unique_ptr strip( size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), 
rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column -- calculate the size of each output string auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), strip_fn{d_column, stype, d_to_strip}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.data(); // build the chars column -- convert characters based on case_flag parameter size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = create_chars_child_column(strings_count, null_count, bytes, mr, stream); + auto chars_column = create_chars_child_column(strings_count, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, strip_fn{d_column, stype, d_to_strip, d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -161,7 +160,7 @@ std::unique_ptr strip(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::strip(strings, stype, to_strip, mr); + return detail::strip(strings, stype, to_strip, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 7cc039b5141..2bb5723dc9b 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include + #include #include #include @@ -28,7 +30,7 @@ #include #include -#include +#include namespace cudf { namespace strings { @@ -98,11 +100,11 @@ std::unique_ptr slice_strings( numeric_scalar const& start = numeric_scalar(0, false), numeric_scalar const& stop = numeric_scalar(0, false), numeric_scalar const& step = numeric_scalar(1), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); if (step.is_valid()) CUDF_EXPECTS(step.value(stream) != 0, "Step parameter must not be 0"); @@ -113,22 +115,21 @@ std::unique_ptr slice_strings( auto d_step = get_scalar_device_view(const_cast&>(step)); // copy the null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), substring_fn{d_column, d_start, d_stop, d_step}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_new_offsets = offsets_column->view().data(); // build chars column auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); - 
thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, substring_fn{d_column, d_start, d_stop, d_step, d_new_offsets, d_chars}); @@ -153,7 +154,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings(strings, start, stop, step, mr); + return detail::slice_strings(strings, start, stop, step, rmm::cuda_stream_default, mr); } namespace detail { @@ -205,8 +206,8 @@ std::unique_ptr compute_substrings_from_fn(column_device_view const& d_c size_type null_count, cudf::detail::input_indexalator starts, cudf::detail::input_indexalator stops, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = d_column.size(); @@ -220,16 +221,16 @@ std::unique_ptr compute_substrings_from_fn(column_device_view const& d_c auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), substring_from_fn{d_column, starts, stops}); auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_new_offsets = offsets_column->view().data(); // Build chars column auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = - cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), 
thrust::make_counting_iterator(0), strings_count, substring_from_fn{d_column, starts, stops, d_new_offsets, d_chars}); @@ -255,13 +256,13 @@ void compute_substring_indices(column_device_view const& d_column, size_type delimiter_count, size_type* start_char_pos, size_type* end_char_pos, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = d_column.size(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [delim_itr, delimiter_count, start_char_pos, end_char_pos, d_column] __device__(size_type idx) { @@ -313,11 +314,11 @@ std::unique_ptr slice_strings( strings_column_view const& strings, column_view const& starts_column, column_view const& stops_column, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); CUDF_EXPECTS(starts_column.size() == strings_count, "Parameter starts must have the same number of rows as strings."); CUDF_EXPECTS(stops_column.size() == strings_count, @@ -334,15 +335,15 @@ std::unique_ptr slice_strings( auto starts_iter = cudf::detail::indexalator_factory::make_input_iterator(starts_column); auto stops_iter = cudf::detail::indexalator_factory::make_input_iterator(stops_column); return compute_substrings_from_fn( - *strings_column, strings.null_count(), starts_iter, stops_iter, mr, stream); + *strings_column, strings.null_count(), starts_iter, stops_iter, stream, mr); } template std::unique_ptr slice_strings(strings_column_view const& strings, DelimiterItrT const delimiter_itr, size_type 
count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); // If there aren't any rows, return an empty strings column @@ -368,7 +369,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, if (count != 0) { // Compute the substring indices first compute_substring_indices( - d_column, delimiter_itr, count, start_char_pos, end_char_pos, mr, stream); + d_column, delimiter_itr, count, start_char_pos, end_char_pos, stream, mr); } // Extract the substrings using the indices next @@ -377,7 +378,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, auto stops_iter = cudf::detail::indexalator_factory::make_input_iterator(stop_chars_pos_vec->view()); return compute_substrings_from_fn( - d_column, strings.null_count(), starts_iter, stops_iter, mr, stream); + d_column, strings.null_count(), starts_iter, stops_iter, stream, mr); } } // namespace detail @@ -390,7 +391,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings(strings, starts_column, stops_column, mr); + return detail::slice_strings(strings, starts_column, stops_column, rmm::cuda_stream_default, mr); } std::unique_ptr slice_strings(strings_column_view const& strings, @@ -399,8 +400,11 @@ std::unique_ptr slice_strings(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings( - strings, cudf::detail::make_pair_iterator(delimiter), count, mr, nullptr); + return detail::slice_strings(strings, + cudf::detail::make_pair_iterator(delimiter), + count, + rmm::cuda_stream_default, + mr); } std::unique_ptr slice_strings(strings_column_view const& strings, @@ -419,11 +423,13 @@ std::unique_ptr slice_strings(strings_column_view const& strings, strings, cudf::detail::make_pair_iterator(delimiters_dev_view), 
count, + rmm::cuda_stream_default, mr) : detail::slice_strings( strings, cudf::detail::make_pair_iterator(delimiters_dev_view), count, + rmm::cuda_stream_default, mr); } diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 4cc5d2bcba8..f643a60722a 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -24,7 +26,7 @@ #include #include -#include +#include #include @@ -76,11 +78,11 @@ struct translate_fn { std::unique_ptr translate( strings_column_view const& strings, std::vector> const& chars_table, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(stream, mr); size_type table_size = static_cast(chars_table.size()); // convert input table @@ -95,26 +97,25 @@ std::unique_ptr translate( auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // create offsets column auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), translate_fn{d_strings, table.begin(), table.end()}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = 
thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, translate_fn{d_strings, table.begin(), table.end(), d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -133,7 +134,7 @@ std::unique_ptr translate(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::translate(strings, chars_table); + return detail::translate(strings, chars_table, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index ce15fb80960..0737cd1e003 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,16 +16,20 @@ #include #include +#include +#include + #include #include #include #include -#include -#include #include +#include + #include #include + #include namespace cudf { @@ -33,14 +37,14 @@ namespace strings { namespace detail { // Used to build a temporary string_view object from a single host string. 
std::unique_ptr> string_from_host( - const char* str, cudaStream_t stream) + const char* str, rmm::cuda_stream_view stream) { if (!str) return nullptr; auto length = std::strlen(str); auto* d_str = new rmm::device_buffer(length, stream); - CUDA_TRY(cudaMemcpyAsync(d_str->data(), str, length, cudaMemcpyHostToDevice, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(d_str->data(), str, length, cudaMemcpyHostToDevice, stream.value())); + stream.synchronize(); auto deleter = [d_str](string_view* sv) { delete d_str; }; return std::unique_ptr{ @@ -49,7 +53,7 @@ std::unique_ptr> string_from_host // build a vector of string_view objects from a strings column rmm::device_vector create_string_vector_from_column(cudf::strings_column_view strings, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto execpol = rmm::exec_policy(stream); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -58,7 +62,7 @@ rmm::device_vector create_string_vector_from_column(cudf::strings_c auto count = strings.size(); rmm::device_vector strings_vector(count); string_view* d_strings = strings_vector.data().get(); - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), count, [d_column, d_strings] __device__(size_type idx) { @@ -73,10 +77,10 @@ rmm::device_vector create_string_vector_from_column(cudf::strings_c // build a strings offsets column from a vector of string_views std::unique_ptr child_offsets_from_string_vector( const rmm::device_vector& strings, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return child_offsets_from_string_iterator(strings.begin(), strings.size(), mr, stream); + return child_offsets_from_string_iterator(strings.begin(), strings.size(), stream, mr); } // build a strings chars column from an vector of string_views @@ -84,8 +88,8 @@ std::unique_ptr 
child_chars_from_string_vector( const rmm::device_vector& strings, const int32_t* d_offsets, cudf::size_type null_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type count = strings.size(); auto d_strings = strings.data().get(); @@ -97,7 +101,7 @@ std::unique_ptr child_chars_from_string_vector( make_numeric_column(data_type{type_id::INT8}, bytes, mask_state::UNALLOCATED, stream, mr); // get it's view auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), count, [d_strings, d_offsets, d_chars] __device__(size_type idx) { @@ -112,8 +116,8 @@ std::unique_ptr child_chars_from_string_vector( std::unique_ptr create_chars_child_column(cudf::size_type strings_count, cudf::size_type null_count, cudf::size_type total_bytes, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(null_count <= strings_count, "Invalid null count"); return make_numeric_column( @@ -121,8 +125,8 @@ std::unique_ptr create_chars_child_column(cudf::size_type strings_count, } // -std::unique_ptr make_empty_strings_column(rmm::mr::device_memory_resource* mr, - cudaStream_t stream) +std::unique_ptr make_empty_strings_column(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return std::make_unique(data_type{type_id::STRING}, 0, diff --git a/cpp/src/strings/utilities.cuh b/cpp/src/strings/utilities.cuh index 06a8aab4dc4..c90cc3aeee1 100644 --- a/cpp/src/strings/utilities.cuh +++ b/cpp/src/strings/utilities.cuh @@ -19,6 +19,8 @@ #include #include +#include + #include namespace cudf { @@ -74,8 +76,8 @@ auto make_strings_children( SizeAndExecuteFunction size_and_exec_fn, size_type strings_count, size_type null_count, - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto offsets_column = make_numeric_column( data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); @@ -86,7 +88,7 @@ auto make_strings_children( // This is called twice -- once for offsets and once for chars. // Reducing the number of places size_and_exec_fn is inlined speeds up compile time. auto for_each_fn = [strings_count, stream](SizeAndExecuteFunction& size_and_exec_fn) { - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, size_and_exec_fn); @@ -94,12 +96,14 @@ auto make_strings_children( // Compute the offsets values for_each_fn(size_and_exec_fn); - thrust::exclusive_scan( - rmm::exec_policy(stream)->on(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), + d_offsets, + d_offsets + strings_count + 1, + d_offsets); // Now build the chars column std::unique_ptr chars_column = create_chars_child_column( - strings_count, null_count, thrust::device_pointer_cast(d_offsets)[strings_count], mr, stream); + strings_count, null_count, thrust::device_pointer_cast(d_offsets)[strings_count], stream, mr); size_and_exec_fn.d_chars = chars_column->mutable_view().template data(); for_each_fn(size_and_exec_fn); diff --git a/cpp/src/strings/utilities.hpp b/cpp/src/strings/utilities.hpp index b61f9581078..3377d8bab35 100644 --- a/cpp/src/strings/utilities.hpp +++ b/cpp/src/strings/utilities.hpp @@ -15,11 +15,13 @@ */ #pragma once +#include + namespace cudf { namespace strings { namespace detail { // Type for the character flags table. 
-using character_flags_table_type = uint8_t; +using character_flags_table_type = std::uint8_t; /** * @brief Returns pointer to device memory that contains the static diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index c61fd0797a4..e42a8b51f9f 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -28,6 +28,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -90,13 +92,13 @@ template std::unique_ptr wrap( strings_column_view const& strings, size_type width, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(width > 0, "Positive wrap width required"); auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); auto execpol = rmm::exec_policy(stream); @@ -105,8 +107,7 @@ std::unique_ptr wrap( size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_column = std::make_unique(strings.offsets(), stream, mr); // makes a copy @@ -117,7 +118,7 @@ std::unique_ptr wrap( device_execute_functor d_execute_fctr{d_column, d_new_offsets, d_chars, width}; - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, d_execute_fctr); @@ -138,7 +139,7 @@ std::unique_ptr wrap(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::wrap(strings, width, mr); + return detail::wrap(strings, width, rmm::cuda_stream_default, 
mr); } } // namespace strings diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index 5ac1b5162be..464306fee94 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -14,6 +14,10 @@ * limitations under the License. */ +#include + +#include + #include #include #include @@ -27,12 +31,12 @@ #include #include #include -#include -#include + +#include +#include #include #include -#include namespace nvtext { namespace detail { @@ -98,17 +102,18 @@ struct token_row_offsets_fn { cudf::size_type const tokens_counts; template ()>* = nullptr> - std::unique_ptr> operator()(cudaStream_t stream) const + std::unique_ptr> operator()( + rmm::cuda_stream_view stream) const { index_changed_fn pfn{row_indices.data(), sorted_indices.template data()}; auto const output_count = - thrust::count_if(rmm::exec_policy(stream)->on(stream), + thrust::count_if(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(tokens_counts), pfn); auto tokens_offsets = std::make_unique>(output_count + 1, stream); - thrust::copy_if(rmm::exec_policy(stream)->on(stream), + thrust::copy_if(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(tokens_counts), tokens_offsets->begin(), @@ -134,7 +139,7 @@ struct token_row_offsets_fn { std::unique_ptr detokenize(cudf::strings_column_view const& strings, cudf::column_view const& row_indices, cudf::string_scalar const& separator, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be valid"); @@ -164,17 +169,17 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string thrust::make_counting_iterator(0), detokenizer_fn{*strings_column, d_row_map, tokens_offsets->data(), d_separator}); auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr 
+ output_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + output_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build the chars column - append each source token to the appropriate output row cudf::size_type const total_bytes = cudf::detail::get_value(offsets_column->view(), output_count, stream); auto chars_column = - cudf::strings::detail::create_chars_child_column(output_count, 0, total_bytes, mr, stream); + cudf::strings::detail::create_chars_child_column(output_count, 0, total_bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), output_count, detokenizer_fn{ @@ -199,7 +204,7 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::detokenize(strings, row_indices, separator, 0, mr); + return detail::detokenize(strings, row_indices, separator, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu index 64508467087..6977def28ef 100644 --- a/cpp/src/text/edit_distance.cu +++ b/cpp/src/text/edit_distance.cu @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include + #include #include #include @@ -22,11 +24,11 @@ #include #include -#include +#include +#include #include #include -#include namespace nvtext { namespace detail { @@ -141,7 +143,7 @@ struct edit_distance_matrix_levenshtein_algorithm { */ std::unique_ptr edit_distance(cudf::strings_column_view const& strings, cudf::strings_column_view const& targets, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { cudf::size_type strings_count = strings.size(); @@ -165,7 +167,7 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str mr); auto d_results = results->mutable_view().data(); auto execpol = rmm::exec_policy(stream); - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_results, @@ -181,9 +183,10 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str // get the total size of the temporary compute buffer size_t compute_size = - thrust::reduce(execpol->on(stream), d_results, d_results + strings_count, size_t{0}); + thrust::reduce(execpol->on(stream.value()), d_results, d_results + strings_count, size_t{0}); // convert sizes to offsets in-place - thrust::exclusive_scan(execpol->on(stream), d_results, d_results + strings_count, d_results); + thrust::exclusive_scan( + execpol->on(stream.value()), d_results, d_results + strings_count, d_results); // create the temporary compute buffer rmm::device_uvector compute_buffer(compute_size, stream); auto d_buffer = compute_buffer.data(); @@ -192,7 +195,7 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str // - on input, d_results is the offset to the working section of d_buffer for each row // - on output, d_results is the calculated edit distance for that row thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, 
edit_distance_levenshtein_algorithm{d_strings, d_targets, d_buffer, d_results}); @@ -203,7 +206,7 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str * @copydoc nvtext::edit_distance_matrix */ std::unique_ptr edit_distance_matrix(cudf::strings_column_view const& strings, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { cudf::size_type strings_count = strings.size(); @@ -224,9 +227,9 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con cudf::size_type n_upper = (strings_count * (strings_count - 1)) / 2; rmm::device_uvector offsets(n_upper, stream); auto d_offsets = offsets.data(); - CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream)); + CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream.value())); thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count * strings_count, [d_strings, d_offsets, strings_count] __device__(cudf::size_type idx) { @@ -244,9 +247,10 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con // get the total size for the compute buffer size_t compute_size = - thrust::reduce(execpol->on(stream), offsets.begin(), offsets.end(), size_t{0}); + thrust::reduce(execpol->on(stream.value()), offsets.begin(), offsets.end(), size_t{0}); // convert sizes to offsets in-place - thrust::exclusive_scan(execpol->on(stream), offsets.begin(), offsets.end(), offsets.begin()); + thrust::exclusive_scan( + execpol->on(stream.value()), offsets.begin(), offsets.end(), offsets.begin()); // create the compute buffer rmm::device_uvector compute_buffer(compute_size, stream); auto d_buffer = compute_buffer.data(); @@ -260,7 +264,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con mr); auto d_results = results->mutable_view().data(); thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), 
thrust::make_counting_iterator(0), strings_count * strings_count, edit_distance_matrix_levenshtein_algorithm{d_strings, d_buffer, d_offsets, d_results}); @@ -273,7 +277,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con stream, mr); thrust::transform_exclusive_scan( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count + 1), offsets_column->mutable_view().data(), @@ -301,7 +305,7 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::edit_distance(strings, targets, 0, mr); + return detail::edit_distance(strings, targets, rmm::cuda_stream_default, mr); } /** @@ -311,7 +315,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::edit_distance_matrix(strings, 0, mr); + return detail::edit_distance_matrix(strings, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 792b94aaee6..815b2720f3a 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -14,6 +14,10 @@ * limitations under the License. 
*/ +#include + +#include + #include #include #include @@ -25,8 +29,8 @@ #include #include #include -#include -#include + +#include #include @@ -78,8 +82,8 @@ std::unique_ptr generate_ngrams( cudf::strings_column_view const& strings, cudf::size_type ngrams = 2, cudf::string_scalar const& separator = cudf::string_scalar{"_"}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be valid"); cudf::string_view const d_separator(separator.data(), separator.size()); @@ -89,7 +93,6 @@ std::unique_ptr generate_ngrams( if (strings_count == 0) // if no strings, return an empty column return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - auto execpol = rmm::exec_policy(stream); auto strings_column = cudf::column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; @@ -131,16 +134,17 @@ std::unique_ptr generate_ngrams( thrust::make_transform_iterator(thrust::make_counting_iterator(0), ngram_generator_fn{d_strings, ngrams, d_separator}); auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + ngrams_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + ngrams_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build the chars column // generate the ngrams from the input strings and copy them into the chars data buffer cudf::size_type const total_bytes = thrust::device_pointer_cast(d_offsets)[ngrams_count]; auto chars_column = - cudf::strings::detail::create_chars_child_column(ngrams_count, 0, total_bytes, mr, stream); + cudf::strings::detail::create_chars_child_column(ngrams_count, 0, total_bytes, stream, mr); char* const d_chars = chars_column->mutable_view().data(); - 
thrust::for_each_n(execpol->on(stream), + auto execpol = rmm::exec_policy(stream); + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), ngrams_count, ngram_generator_fn{d_strings, ngrams, d_separator, d_offsets, d_chars}); @@ -164,7 +168,7 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_ngrams(strings, ngrams, separator, mr); + return detail::generate_ngrams(strings, ngrams, separator, rmm::cuda_stream_default, mr); } namespace detail { @@ -203,7 +207,7 @@ struct character_ngram_generator_fn { std::unique_ptr generate_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(ngrams > 1, "Parameter ngrams should be an integer value of 2 or greater"); @@ -219,7 +223,7 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie // create a vector of ngram offsets for each string rmm::device_vector ngram_offsets(strings_count + 1); thrust::transform_exclusive_scan( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count + 1), ngram_offsets.begin(), @@ -238,7 +242,7 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie d_ngram_offsets + strings_count, sizeof(cudf::size_type), cudaMemcpyDeviceToHost, - stream)); + stream.value())); CUDF_EXPECTS(total_ngrams > 0, "Insufficient number of characters in each string to generate ngrams"); @@ -251,21 +255,22 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie auto d_offsets = offsets_column->mutable_view().data(); // compute the size of each ngram -- output goes in d_offsets character_ngram_generator_fn generator{d_strings, ngrams, d_ngram_offsets, d_offsets}; - thrust::for_each_n(execpol->on(stream), + 
thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, generator); // convert sizes into offsets in-place - thrust::exclusive_scan(execpol->on(stream), d_offsets, d_offsets + total_ngrams + 1, d_offsets); + thrust::exclusive_scan( + execpol->on(stream.value()), d_offsets, d_offsets + total_ngrams + 1, d_offsets); // build the chars column auto const chars_bytes = cudf::detail::get_value(offsets_column->view(), total_ngrams, stream); auto chars_column = - cudf::strings::detail::create_chars_child_column(total_ngrams, 0, chars_bytes, mr, stream); + cudf::strings::detail::create_chars_child_column(total_ngrams, 0, chars_bytes, stream, mr); generator.d_chars = chars_column->mutable_view().data(); // output chars - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, generator); @@ -286,7 +291,7 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::generate_character_ngrams(strings, ngrams, 0, mr); + return detail::generate_character_ngrams(strings, ngrams, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index 1ec356bbf33..fab49af99a9 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -14,6 +14,13 @@ * limitations under the License. 
*/ +#include +#include + +#include + +#include + #include #include #include @@ -22,10 +29,8 @@ #include #include #include -#include -#include -#include -#include + +#include #include #include @@ -130,8 +135,8 @@ std::unique_ptr ngrams_tokenize( cudf::size_type ngrams = 2, cudf::string_scalar const& delimiter = cudf::string_scalar(""), cudf::string_scalar const& separator = cudf::string_scalar{"_"}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); @@ -140,7 +145,7 @@ std::unique_ptr ngrams_tokenize( CUDF_EXPECTS(ngrams >= 1, "Parameter ngrams should be an integer value of 1 or greater"); if (ngrams == 1) // this is just a straight tokenize - return tokenize(strings, delimiter, mr, stream); + return tokenize(strings, delimiter, stream, mr); auto strings_count = strings.size(); if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); @@ -155,13 +160,13 @@ std::unique_ptr ngrams_tokenize( // Ex. token-counts = [3,2]; token-offsets = [0,3,5] rmm::device_vector token_offsets(strings_count + 1); auto d_token_offsets = token_offsets.data().get(); - thrust::transform_inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::transform_inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_token_offsets + 1, strings_tokenizer{d_strings, d_delimiter}, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_token_offsets, 0, sizeof(int32_t), stream)); + CUDA_TRY(cudaMemsetAsync(d_token_offsets, 0, sizeof(int32_t), stream.value())); auto total_tokens = token_offsets[strings_count]; // Ex. 
5 tokens // get the token positions (in bytes) per string @@ -169,7 +174,7 @@ std::unique_ptr ngrams_tokenize( rmm::device_vector token_positions(total_tokens); auto d_token_positions = token_positions.data().get(); thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, string_tokens_positions_fn{d_strings, d_delimiter, d_token_offsets, d_token_positions}); @@ -179,7 +184,7 @@ std::unique_ptr ngrams_tokenize( rmm::device_vector ngram_offsets(strings_count + 1); auto d_ngram_offsets = ngram_offsets.data().get(); thrust::transform_inclusive_scan( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_ngram_offsets + 1, @@ -188,7 +193,7 @@ std::unique_ptr ngrams_tokenize( return (token_count >= ngrams) ? token_count - ngrams + 1 : 0; }, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_ngram_offsets, 0, sizeof(int32_t), stream)); + CUDA_TRY(cudaMemsetAsync(d_ngram_offsets, 0, sizeof(int32_t), stream.value())); auto total_ngrams = ngram_offsets[strings_count]; // Compute the total size of the ngrams for each string (not for each ngram) @@ -202,13 +207,13 @@ std::unique_ptr ngrams_tokenize( rmm::device_vector chars_offsets(strings_count + 1); // output memory offsets auto d_chars_offsets = chars_offsets.data().get(); // per input string thrust::transform_inclusive_scan( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_chars_offsets + 1, ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions}, thrust::plus()); - CUDA_TRY(cudaMemsetAsync(d_chars_offsets, 0, sizeof(int32_t), stream)); + CUDA_TRY(cudaMemsetAsync(d_chars_offsets, 0, sizeof(int32_t), stream.value())); auto output_chars_size = chars_offsets[strings_count]; // Ex. 
14 output bytes total rmm::device_vector ngram_sizes(total_ngrams); // size in bytes of each @@ -216,12 +221,12 @@ std::unique_ptr ngrams_tokenize( // build chars column auto chars_column = cudf::strings::detail::create_chars_child_column( - strings_count, 0, output_chars_size, mr, stream); + strings_count, 0, output_chars_size, stream, mr); auto d_chars = chars_column->mutable_view().data(); // Generate the ngrams into the chars column data buffer. // The ngram_builder_fn functor also fills the d_ngram_sizes vector with the // size of each ngram. - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, ngram_builder_fn{d_strings, @@ -235,7 +240,7 @@ std::unique_ptr ngrams_tokenize( d_ngram_sizes}); // build the offsets column -- converting the ngram sizes into offsets auto offsets_column = cudf::strings::detail::make_offsets_child_column( - ngram_sizes.begin(), ngram_sizes.end(), mr, stream); + ngram_sizes.begin(), ngram_sizes.end(), stream, mr); chars_column->set_null_count(0); offsets_column->set_null_count(0); // create the output strings column @@ -259,7 +264,8 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, mr); + return detail::ngrams_tokenize( + strings, ngrams, delimiter, separator, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index ac67b08eba0..0f3e3ec6b6b 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -154,11 +154,11 @@ struct codepoint_to_utf8_fn { } // namespace -// details API +// detail API std::unique_ptr normalize_spaces( cudf::strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { cudf::size_type strings_count = strings.size(); if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); @@ -167,30 +167,29 @@ std::unique_ptr normalize_spaces( auto strings_column = cudf::column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // copy bitmask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // create offsets by calculating size of each string for output auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), normalize_spaces_fn{d_strings}); // this does size-only calc auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // build the chars column cudf::size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = cudf::strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + strings_count, strings.null_count(), bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); // copy tokens to the chars buffer - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, normalize_spaces_fn{d_strings, d_offsets, d_chars}); chars_column->set_null_count(0); // reset null count for child column - // + return cudf::make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -205,7 +204,7 @@ std::unique_ptr normalize_spaces( */ std::unique_ptr 
normalize_characters(cudf::strings_column_view const& strings, bool do_lower_case, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const strings_count = strings.size(); @@ -236,32 +235,31 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con thrust::make_transform_iterator(thrust::make_counting_iterator(0), codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets}); auto offsets_column = cudf::strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); auto d_offsets = offsets_column->view().data(); // create the output chars column cudf::size_type output_bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = cudf::strings::detail::create_chars_child_column( - strings_count, strings.null_count(), output_bytes, mr, stream); + strings_count, strings.null_count(), output_bytes, stream, mr); auto d_chars = chars_column->mutable_view().data(); // build the chars output data: convert the 4-byte code-point values into UTF-8 chars thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets, d_offsets, d_chars}); chars_column->set_null_count(0); // reset null count for child column - return cudf::make_strings_column( - strings_count, - std::move(offsets_column), - std::move(chars_column), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - stream, - mr); + return cudf::make_strings_column(strings_count, + std::move(offsets_column), + std::move(chars_column), + strings.null_count(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + stream, + mr); } } // namespace detail @@ -272,7 +270,7 @@ 
std::unique_ptr normalize_spaces(cudf::strings_column_view const& rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::normalize_spaces(strings, mr); + return detail::normalize_spaces(strings, rmm::cuda_stream_default, mr); } /** @@ -283,7 +281,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::normalize_characters(strings, do_lower_case, 0, mr); + return detail::normalize_characters(strings, do_lower_case, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 8da94e69da9..e1a03c3462b 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -14,6 +14,13 @@ * limitations under the License. */ +#include + +#include + +#include +#include + #include #include #include @@ -24,12 +31,7 @@ #include #include -#include -#include - -#include - -#include +#include namespace nvtext { namespace detail { @@ -194,7 +196,7 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st cudf::strings_column_view const& targets, cudf::strings_column_view const& replacements, cudf::string_scalar const& delimiter, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!targets.has_nulls(), "Parameter targets must not have nulls"); @@ -218,12 +220,11 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st *replacements_column}; // copy null mask from input column - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls replacer to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( - replacer, strings_count, strings.null_count(), mr, stream); + replacer, strings_count, strings.null_count(), stream, mr); 
// return new strings column return cudf::make_strings_column(strings_count, @@ -239,7 +240,7 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str cudf::size_type min_token_length, cudf::string_scalar const& replacement, cudf::string_scalar const& delimiter, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(replacement.is_valid(), "Parameter replacement must be valid"); @@ -254,12 +255,11 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str remove_small_tokens_fn filterer{*strings_column, d_delimiter, min_token_length, d_replacement}; // copy null mask from input column - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( - filterer, strings_count, strings.null_count(), mr, stream); + filterer, strings_count, strings.null_count(), stream, mr); // return new strings column return cudf::make_strings_column(strings_count, @@ -282,7 +282,8 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_tokens(strings, targets, replacements, delimiter, 0, mr); + return detail::replace_tokens( + strings, targets, replacements, delimiter, rmm::cuda_stream_default, mr); } std::unique_ptr filter_tokens(cudf::strings_column_view const& strings, @@ -292,7 +293,8 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::filter_tokens(strings, min_token_length, replacement, delimiter, 0, mr); + return detail::filter_tokens( + strings, min_token_length, replacement, delimiter, rmm::cuda_stream_default, mr); } } // namespace nvtext diff 
--git a/cpp/src/text/stemmer.cu b/cpp/src/text/stemmer.cu index 8810ea759e7..eace646934d 100644 --- a/cpp/src/text/stemmer.cu +++ b/cpp/src/text/stemmer.cu @@ -14,6 +14,10 @@ * limitations under the License. */ +#include + +#include + #include #include #include @@ -24,9 +28,7 @@ #include #include -#include - -#include +#include #include #include @@ -93,22 +95,22 @@ template std::unique_ptr is_letter(cudf::strings_column_view const& strings, letter_type ltype, PositionIterator position_itr, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::BOOL8}); // create empty output column - auto results = cudf::make_fixed_width_column( - cudf::data_type{cudf::type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); // set values into output column auto strings_column = cudf::column_device_view::create(strings.parent(), stream); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), results->mutable_view().data(), @@ -126,7 +128,7 @@ struct dispatch_is_letter_fn { std::unique_ptr operator()(cudf::strings_column_view const& strings, letter_type ltype, cudf::column_view const& indices, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(strings.size() == indices.size(), @@ -135,6 +137,7 @@ struct dispatch_is_letter_fn { // resolve and pass an iterator for the indices column to the detail function return is_letter(strings, ltype, 
indices.begin(), stream, mr); } + template ()>* = nullptr> std::unique_ptr operator()(Args&&... args) const { @@ -201,22 +204,22 @@ struct porter_stemmer_measure_fn { } // namespace std::unique_ptr porter_stemmer_measure(cudf::strings_column_view const& strings, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); // create empty output column - auto results = cudf::make_fixed_width_column( - cudf::data_type{cudf::type_id::INT32}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = + cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); // compute measures into output column auto strings_column = cudf::column_device_view::create(strings.parent(), stream); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings.size()), results->mutable_view().data(), @@ -227,7 +230,7 @@ std::unique_ptr porter_stemmer_measure(cudf::strings_column_view c std::unique_ptr is_letter(cudf::strings_column_view const& strings, letter_type ltype, cudf::column_view const& indices, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return cudf::type_dispatcher( @@ -244,8 +247,11 @@ std::unique_ptr is_letter(cudf::strings_column_view const& strings rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_letter( - strings, ltype, thrust::make_constant_iterator(character_index), 0, mr); + return detail::is_letter(strings, + ltype, + thrust::make_constant_iterator(character_index), + rmm::cuda_stream_default, + mr); } 
std::unique_ptr is_letter(cudf::strings_column_view const& strings, @@ -254,7 +260,7 @@ std::unique_ptr is_letter(cudf::strings_column_view const& strings rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_letter(strings, ltype, indices, 0, mr); + return detail::is_letter(strings, ltype, indices, rmm::cuda_stream_default, mr); } /** @@ -264,7 +270,7 @@ std::unique_ptr porter_stemmer_measure(cudf::strings_column_view c rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::porter_stemmer_measure(strings, 0, mr); + return detail::porter_stemmer_measure(strings, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 12f0b0d9813..0b1baab3758 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -14,11 +14,14 @@ * limitations under the License. */ +#include +#include + #include #include #include -#include -#include + +#include #include #include @@ -258,7 +261,7 @@ __global__ void kernel_data_normalizer(unsigned char const* strings, } // namespace -data_normalizer::data_normalizer(cudaStream_t stream, bool do_lower_case) +data_normalizer::data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case) : do_lower_case(do_lower_case) { d_cp_metadata = detail::get_codepoint_metadata(stream); @@ -268,7 +271,7 @@ data_normalizer::data_normalizer(cudaStream_t stream, bool do_lower_case) uvector_pair data_normalizer::normalize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (num_strings == 0) return std::make_pair(std::make_unique>(0, stream), @@ -278,7 +281,7 @@ uvector_pair data_normalizer::normalize(char const* d_strings, // copy offsets to working memory size_t const num_offsets = num_strings + 1; auto d_strings_offsets = std::make_unique>(num_offsets, stream); - 
thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_offsets), d_strings_offsets->begin(), @@ -298,7 +301,7 @@ uvector_pair data_normalizer::normalize(char const* d_strings, auto d_code_points = std::make_unique>(max_new_char_total, stream); rmm::device_uvector d_chars_per_thread(threads_on_device, stream); - kernel_data_normalizer<<>>( + kernel_data_normalizer<<>>( reinterpret_cast(d_strings), bytes_count, d_cp_metadata, @@ -308,19 +311,21 @@ uvector_pair data_normalizer::normalize(char const* d_strings, d_chars_per_thread.data()); // Remove the 'empty' code points from the vector - thrust::remove( - execpol->on(stream), d_code_points->begin(), d_code_points->end(), uint32_t{1 << FILTER_BIT}); + thrust::remove(execpol->on(stream.value()), + d_code_points->begin(), + d_code_points->end(), + uint32_t{1 << FILTER_BIT}); // We also need to prefix sum the number of characters up to an including // the current character in order to get the new strings lengths. - thrust::inclusive_scan(execpol->on(stream), + thrust::inclusive_scan(execpol->on(stream.value()), d_chars_per_thread.begin(), d_chars_per_thread.end(), d_chars_per_thread.begin()); // This will reset the offsets to the new generated code point values thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(1), num_strings, update_strings_lengths_fn{d_chars_per_thread.data(), d_strings_offsets->data()}); diff --git a/cpp/src/text/subword/detail/data_normalizer.hpp b/cpp/src/text/subword/detail/data_normalizer.hpp index 9148bde5317..1a9eb5ba997 100644 --- a/cpp/src/text/subword/detail/data_normalizer.hpp +++ b/cpp/src/text/subword/detail/data_normalizer.hpp @@ -18,6 +18,7 @@ #include +#include #include using uvector_pair = std::pair>, @@ -54,7 +55,7 @@ class data_normalizer { * input stream to lower case and strip accents from those characters. 
* If false, accented and uppercase characters are not transformed. */ - data_normalizer(cudaStream_t stream, bool do_lower_case = true); + data_normalizer(rmm::cuda_stream_view stream, bool do_lower_case = true); /** * @brief Normalize a vector of strings. @@ -83,7 +84,7 @@ class data_normalizer { uvector_pair normalize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: bool const do_lower_case; diff --git a/cpp/src/text/subword/detail/tokenizer_utils.cuh b/cpp/src/text/subword/detail/tokenizer_utils.cuh index 0bb92c01a9d..48ee0fc2b51 100644 --- a/cpp/src/text/subword/detail/tokenizer_utils.cuh +++ b/cpp/src/text/subword/detail/tokenizer_utils.cuh @@ -18,6 +18,8 @@ #include +#include + #include namespace nvtext { @@ -60,7 +62,7 @@ struct update_strings_lengths_fn { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -codepoint_metadata_type const* get_codepoint_metadata(cudaStream_t stream); +codepoint_metadata_type const* get_codepoint_metadata(rmm::cuda_stream_view stream); /** * @brief Retrieve the aux code point metadata table. @@ -70,7 +72,7 @@ codepoint_metadata_type const* get_codepoint_metadata(cudaStream_t stream); * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -aux_codepoint_data_type const* get_aux_codepoint_data(cudaStream_t stream); +aux_codepoint_data_type const* get_aux_codepoint_data(rmm::cuda_stream_view stream); } // namespace detail } // namespace nvtext diff --git a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp index 2813c018145..e61437b7703 100644 --- a/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp +++ b/cpp/src/text/subword/detail/wordpiece_tokenizer.hpp @@ -18,6 +18,8 @@ #include +#include + namespace nvtext { struct hashed_vocabulary; @@ -70,7 +72,7 @@ class wordpiece_tokenizer { uint32_t stride, bool do_truncate, bool do_lower_case, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, uint32_t max_word_length = 200); /** @@ -88,7 +90,7 @@ class wordpiece_tokenizer { uvector_pair tokenize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - cudaStream_t stream); + rmm::cuda_stream_view stream); private: /** @@ -100,7 +102,7 @@ class wordpiece_tokenizer { * per string. * @param stream CUDA stream used for device memory operations and kernel launches. */ - void tokenize(uvector_pair& cps_and_offsets, cudaStream_t stream); + void tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stream_view stream); hashed_vocabulary const& vocab_table; data_normalizer normalizer; // removes punctuation, accents, etc diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu index d0f797fca62..b905fdebb1a 100644 --- a/cpp/src/text/subword/load_hash_file.cu +++ b/cpp/src/text/subword/load_hash_file.cu @@ -14,14 +14,18 @@ * limitations under the License. 
*/ +#include +#include +#include + +#include + #include #include #include #include -#include -#include -#include -#include + +#include #include #include @@ -38,7 +42,7 @@ namespace detail { * Build the code point metadata table in device memory * using the vector pieces from codepoint_metadata.ah */ -const codepoint_metadata_type* get_codepoint_metadata(cudaStream_t stream) +const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stream) { static cudf::strings::detail::thread_safe_per_context_cache g_codepoint_metadata; @@ -46,7 +50,7 @@ const codepoint_metadata_type* get_codepoint_metadata(cudaStream_t stream) codepoint_metadata_type* table = static_cast(rmm::mr::get_current_device_resource()->allocate( codepoint_metadata_size * sizeof(codepoint_metadata_type), stream)); - thrust::fill(rmm::exec_policy(stream)->on(stream), + thrust::fill(rmm::exec_policy(stream)->on(stream.value()), table + cp_section1_end, table + codepoint_metadata_size, codepoint_metadata_default_value); @@ -54,13 +58,13 @@ const codepoint_metadata_type* get_codepoint_metadata(cudaStream_t stream) codepoint_metadata, cp_section1_end * sizeof(codepoint_metadata[0]), // 1st section cudaMemcpyHostToDevice, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync( table + cp_section2_begin, cp_metadata_917505_917999, (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section cudaMemcpyHostToDevice, - stream)); + stream.value())); return table; }); } @@ -71,7 +75,7 @@ const codepoint_metadata_type* get_codepoint_metadata(cudaStream_t stream) * Build the aux code point data table in device memory * using the vector pieces from codepoint_metadata.ah */ -const aux_codepoint_data_type* get_aux_codepoint_data(cudaStream_t stream) +const aux_codepoint_data_type* get_aux_codepoint_data(rmm::cuda_stream_view stream) { static cudf::strings::detail::thread_safe_per_context_cache g_aux_codepoint_data; @@ -79,7 +83,7 @@ const aux_codepoint_data_type* 
get_aux_codepoint_data(cudaStream_t stream) aux_codepoint_data_type* table = static_cast(rmm::mr::get_current_device_resource()->allocate( aux_codepoint_data_size * sizeof(aux_codepoint_data_type), stream)); - thrust::fill(rmm::exec_policy(stream)->on(stream), + thrust::fill(rmm::exec_policy(stream)->on(stream.value()), table + aux_section1_end, table + aux_codepoint_data_size, aux_codepoint_default_value); @@ -87,25 +91,25 @@ const aux_codepoint_data_type* get_aux_codepoint_data(cudaStream_t stream) aux_codepoint_data, aux_section1_end * sizeof(aux_codepoint_data[0]), // 1st section cudaMemcpyHostToDevice, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync( table + aux_section2_begin, aux_cp_data_44032_55203, (aux_section2_end - aux_section2_begin + 1) * sizeof(aux_codepoint_data[0]), // 2nd section cudaMemcpyHostToDevice, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync( table + aux_section3_begin, aux_cp_data_70475_71099, (aux_section3_end - aux_section3_begin + 1) * sizeof(aux_codepoint_data[0]), // 3rd section cudaMemcpyHostToDevice, - stream)); + stream.value())); CUDA_TRY(cudaMemcpyAsync( table + aux_section4_begin, aux_cp_data_119134_119232, (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section cudaMemcpyHostToDevice, - stream)); + stream.value())); return table; }); } @@ -134,7 +138,7 @@ const aux_codepoint_data_type* get_aux_codepoint_data(cudaStream_t stream) * @return object containing hash table elements for the wordpiece tokenizer */ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabulary, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { hashed_vocabulary result; @@ -194,7 +198,7 @@ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabu table.data(), table.size() * sizeof(uint64_t), cudaMemcpyHostToDevice, - stream)); + stream.value())); result.bin_coefficients = 
cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT64}, bin_coefficients.size(), @@ -205,7 +209,7 @@ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabu bin_coefficients.data(), bin_coefficients.size() * sizeof(uint64_t), cudaMemcpyHostToDevice, - stream)); + stream.value())); result.bin_offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT16}, bin_offsets.size(), @@ -216,7 +220,7 @@ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabu bin_offsets.data(), bin_offsets.size() * sizeof(uint16_t), cudaMemcpyHostToDevice, - stream)); + stream.value())); // this just initializes some constant tables into device memory // to help speed up the runtime @@ -232,7 +236,7 @@ hashed_vocabulary load_vocabulary_file(std::string const& filename_hashed_vocabu rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::load_vocabulary_file(filename_hashed_vocabulary, 0, mr); + return detail::load_vocabulary_file(filename_hashed_vocabulary, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index f25d8144bbf..e305a3e7296 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -23,6 +23,8 @@ #include #include +#include + #include #include @@ -127,7 +129,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, bool do_lower_case, bool do_truncate, uint32_t max_rows_tensor, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(stride <= max_sequence_length, @@ -164,7 +166,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, auto d_offsets_per_tensor = offsets_per_tensor.data(); auto const execpol = rmm::exec_policy(stream); thrust::transform_exclusive_scan( - execpol->on(stream), + execpol->on(stream.value()), 
thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count + 1), offsets_per_tensor.begin(), @@ -184,7 +186,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, rmm::device_uvector row2row_within_tensor(nrows_tensor_token_ids, stream); auto d_row2row_within_tensor = row2row_within_tensor.data(); thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_offsets_per_tensor, d_row2tensor, d_row2row_within_tensor] __device__(auto idx) { @@ -218,7 +220,10 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, constexpr int block_size = 256; cudf::detail::grid_1d const grid{ static_cast(nrows_tensor_token_ids * max_sequence_length), block_size}; - kernel_compute_tensor_metadata<<>>( + kernel_compute_tensor_metadata<<>>( device_token_ids, device_offsets, d_row2tensor, diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index b51e5c82688..3e65add8a7d 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -14,13 +14,17 @@ * limitations under the License. 
*/ -#include -#include -#include #include #include #include +#include +#include + +#include + +#include + #include #include #include @@ -270,21 +274,21 @@ wordpiece_tokenizer::wordpiece_tokenizer(hashed_vocabulary const& vocab_table, uint32_t stride, bool do_truncate, bool do_lower_case, - cudaStream_t stream, + rmm::cuda_stream_view stream, uint32_t max_word_length) : vocab_table(vocab_table), + normalizer(stream, do_lower_case), max_sequence_length{max_sequence_length}, - max_word_length{max_word_length}, stride(stride), do_truncate(do_truncate), - normalizer(stream, do_lower_case) + max_word_length{max_word_length} { } uvector_pair wordpiece_tokenizer::tokenize(char const* d_strings, uint32_t const* d_offsets, uint32_t num_strings, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto cps_and_offsets = normalizer.normalize(d_strings, d_offsets, num_strings, stream); tokenize(cps_and_offsets, stream); @@ -299,7 +303,7 @@ struct tranform_fn { // just converting uint8 value to uint32 __device__ uint32_t operator()(uint8_t count) { return count; } }; -void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, cudaStream_t stream) +void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stream_view stream) { uint32_t* device_code_points = cps_and_offsets.first->data(); size_t const num_code_points = cps_and_offsets.first->size(); @@ -321,32 +325,32 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, cudaStream_t s detail::init_data_and_mark_word_start_and_ends<<>>(device_code_points, - device_start_word_indices, - device_end_word_indices, - num_code_points, - device_token_ids.data(), - device_tokens_per_word.data()); - CHECK_CUDA(stream); + stream.value()>>>(device_code_points, + device_start_word_indices, + device_end_word_indices, + num_code_points, + device_token_ids.data(), + device_tokens_per_word.data()); + CHECK_CUDA(stream.value()); cudf::detail::grid_1d const grid_mark{static_cast(num_strings + 1), 
THREADS_PER_BLOCK}; detail::mark_string_start_and_ends<<>>(device_code_points, - device_strings_offsets, - device_start_word_indices, - device_end_word_indices, - num_strings); - CHECK_CUDA(stream); + stream.value()>>>(device_code_points, + device_strings_offsets, + device_start_word_indices, + device_end_word_indices, + num_strings); + CHECK_CUDA(stream.value()); // Now start_word_indices has the word starts scattered throughout the array. We need to select // all values not equal to the max uint32_t and place them at the start of the array. We leverage // the fact that the start_word_indices and the end_word indices are contiguous to only launch one // device select kernel. auto const execpol = rmm::exec_policy(stream); - auto itr_end = thrust::remove(execpol->on(stream), + auto itr_end = thrust::remove(execpol->on(stream.value()), device_word_indices.begin(), device_word_indices.end(), std::numeric_limits::max()); @@ -359,27 +363,28 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, cudaStream_t s device_end_word_indices = device_start_word_indices + num_words; cudf::detail::grid_1d const grid{static_cast(num_words), THREADS_PER_BLOCK}; - detail::kernel_wordpiece_tokenizer<<>>( - device_code_points, - vocab_table.table->view().data(), - vocab_table.bin_coefficients->view().data(), - vocab_table.bin_offsets->view().data(), - vocab_table.unknown_token_id, - vocab_table.outer_hash_a, - vocab_table.outer_hash_b, - vocab_table.num_bins, - device_start_word_indices, - device_end_word_indices, - max_word_length, - num_words, - device_token_ids.data(), - device_tokens_per_word.data()); - CHECK_CUDA(stream); + detail:: + kernel_wordpiece_tokenizer<<>>( + device_code_points, + vocab_table.table->view().data(), + vocab_table.bin_coefficients->view().data(), + vocab_table.bin_offsets->view().data(), + vocab_table.unknown_token_id, + vocab_table.outer_hash_a, + vocab_table.outer_hash_b, + vocab_table.num_bins, + device_start_word_indices, + 
device_end_word_indices, + max_word_length, + num_words, + device_token_ids.data(), + device_tokens_per_word.data()); + CHECK_CUDA(stream.value()); // Repurpose the input array for the token ids. In the worst case, each code point ends up being a // token so this will always have enough memory to store the contiguous tokens. uint32_t* contiguous_token_ids = device_code_points; - thrust::copy_if(execpol->on(stream), + thrust::copy_if(execpol->on(stream.value()), device_token_ids.begin(), device_token_ids.end(), contiguous_token_ids, @@ -387,7 +392,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, cudaStream_t s // Repurpose start word indices since it is the same size and type as the required output. uint32_t* token_id_counts = device_start_word_indices; - thrust::transform_inclusive_scan(execpol->on(stream), + thrust::transform_inclusive_scan(execpol->on(stream.value()), device_tokens_per_word.data(), device_tokens_per_word.data() + num_code_points, token_id_counts, @@ -395,7 +400,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, cudaStream_t s thrust::plus()); // Update the device_strings_offsets using the token_id_counts - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(1), num_strings, update_strings_lengths_fn{token_id_counts, device_strings_offsets}); diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index ea1afa69d2b..e16bf3cf153 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -26,6 +26,8 @@ #include #include +#include + #include #include @@ -36,8 +38,8 @@ namespace { template std::unique_ptr token_count_fn(cudf::size_type strings_count, TokenCounter tokenizer, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // create output column auto token_counts = 
cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, @@ -47,7 +49,7 @@ std::unique_ptr token_count_fn(cudf::size_type strings_count, mr); auto d_token_counts = token_counts->mutable_view().data(); // add the counts to the column - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_token_counts, @@ -59,28 +61,28 @@ std::unique_ptr token_count_fn(cudf::size_type strings_count, template std::unique_ptr tokenize_fn(cudf::size_type strings_count, Tokenizer tokenizer, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto execpol = rmm::exec_policy(stream); // get the number of tokens in each string auto const token_counts = - token_count_fn(strings_count, tokenizer, rmm::mr::get_current_device_resource(), stream); + token_count_fn(strings_count, tokenizer, stream, rmm::mr::get_current_device_resource()); auto d_token_counts = token_counts->view(); // create token-index offsets from the counts rmm::device_vector token_offsets(strings_count + 1); - thrust::inclusive_scan(execpol->on(stream), + thrust::inclusive_scan(execpol->on(stream.value()), d_token_counts.template begin(), d_token_counts.template end(), token_offsets.begin() + 1); - CUDA_TRY(cudaMemsetAsync(token_offsets.data().get(), 0, sizeof(int32_t), stream)); + CUDA_TRY(cudaMemsetAsync(token_offsets.data().get(), 0, sizeof(int32_t), stream.value())); auto const total_tokens = token_offsets.back(); // build a list of pointers to each token rmm::device_vector tokens(total_tokens); // now go get the tokens tokenizer.d_offsets = token_offsets.data().get(); tokenizer.d_tokens = tokens.data().get(); - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, tokenizer); @@ -95,33 +97,33 @@ 
std::unique_ptr tokenize_fn(cudf::size_type strings_count, // zero or more character tokenizer std::unique_ptr tokenize(cudf::strings_column_view const& strings, cudf::string_scalar const& delimiter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); auto strings_column = cudf::column_device_view::create(strings.parent(), stream); - return tokenize_fn(strings.size(), strings_tokenizer{*strings_column, d_delimiter}, mr, stream); + return tokenize_fn(strings.size(), strings_tokenizer{*strings_column, d_delimiter}, stream, mr); } // zero or more character token counter std::unique_ptr count_tokens(cudf::strings_column_view const& strings, cudf::string_scalar const& delimiter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); auto strings_column = cudf::column_device_view::create(strings.parent(), stream); return token_count_fn( - strings.size(), strings_tokenizer{*strings_column, d_delimiter}, mr, stream); + strings.size(), strings_tokenizer{*strings_column, d_delimiter}, stream, mr); } // one or more string delimiter tokenizer std::unique_ptr tokenize(cudf::strings_column_view const& strings, cudf::strings_column_view const& delimiters, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiters.size() > 0, "Parameter delimiters must not be empty"); CUDF_EXPECTS(!delimiters.has_nulls(), "Parameter delimiters must not have nulls"); @@ -132,15 +134,15 @@ std::unique_ptr tokenize(cudf::strings_column_view const& strings, 
multi_delimiter_strings_tokenizer{*strings_column, delimiters_column->begin(), delimiters_column->end()}, - mr, - stream); + stream, + mr); } // one or more string delimiter token counter std::unique_ptr count_tokens(cudf::strings_column_view const& strings, cudf::strings_column_view const& delimiters, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiters.size() > 0, "Parameter delimiters must not be empty"); CUDF_EXPECTS(!delimiters.has_nulls(), "Parameter delimiters must not have nulls"); @@ -151,13 +153,13 @@ std::unique_ptr count_tokens(cudf::strings_column_view const& stri multi_delimiter_strings_tokenizer{*strings_column, delimiters_column->begin(), delimiters_column->end()}, - mr, - stream); + stream, + mr); } // tokenize on every character std::unique_ptr character_tokenize(cudf::strings_column_view const& strings_column, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto strings_count = strings_column.size(); @@ -179,7 +181,7 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const auto execpol = rmm::exec_policy(stream); auto strings_view = cudf::column_device_view::create(strings_column.parent(), stream); cudf::size_type num_characters = thrust::count_if( - execpol->on(stream), d_chars, d_chars + chars_bytes, [] __device__(uint8_t byte) { + execpol->on(stream.value()), d_chars, d_chars + chars_bytes, [] __device__(uint8_t byte) { return cudf::strings::detail::is_begin_utf8_char(byte); }); @@ -198,7 +200,7 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const mr); auto d_new_offsets = offsets_column->mutable_view().begin(); thrust::copy_if( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(chars_bytes + 1), d_new_offsets, @@ -230,7 +232,7 @@ std::unique_ptr tokenize(cudf::strings_column_view const& strings, 
rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::tokenize(strings, delimiter, mr); + return detail::tokenize(strings, delimiter, rmm::cuda_stream_default, mr); } std::unique_ptr tokenize(cudf::strings_column_view const& strings, @@ -238,7 +240,7 @@ std::unique_ptr tokenize(cudf::strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::tokenize(strings, delimiters, mr); + return detail::tokenize(strings, delimiters, rmm::cuda_stream_default, mr); } std::unique_ptr count_tokens(cudf::strings_column_view const& strings, @@ -246,7 +248,7 @@ std::unique_ptr count_tokens(cudf::strings_column_view const& stri rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_tokens(strings, delimiter, mr); + return detail::count_tokens(strings, delimiter, rmm::cuda_stream_default, mr); } std::unique_ptr count_tokens(cudf::strings_column_view const& strings, @@ -254,14 +256,14 @@ std::unique_ptr count_tokens(cudf::strings_column_view const& stri rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_tokens(strings, delimiters, mr); + return detail::count_tokens(strings, delimiters, rmm::cuda_stream_default, mr); } std::unique_ptr character_tokenize(cudf::strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::character_tokenize(strings, 0, mr); + return detail::character_tokenize(strings, rmm::cuda_stream_default, mr); } } // namespace nvtext diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index 0903350f802..987d6272737 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -347,10 +347,7 @@ std::unique_ptr transform_fn(cudf::dictionary_column_view const& i stream, default_mr); return cudf::dictionary::detail::encode( - output->view(), - dictionary::detail::get_indices_type_for_size(output->size()), - mr, - stream.value()); + output->view(), 
dictionary::detail::get_indices_type_for_size(output->size()), stream, mr); } template @@ -360,13 +357,12 @@ struct MathOpDispatcher { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return transform_fn( - input.begin(), - input.end(), - cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), - input.null_count(), - stream, - mr); + return transform_fn(input.begin(), + input.end(), + cudf::detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); } struct dictionary_dispatch { diff --git a/cpp/src/unary/unary_ops.cuh b/cpp/src/unary/unary_ops.cuh index f5f445fff6c..7cfc48d4385 100644 --- a/cpp/src/unary/unary_ops.cuh +++ b/cpp/src/unary/unary_ops.cuh @@ -71,7 +71,7 @@ struct launcher { output_view.begin(), F{}); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return output; } diff --git a/cpp/tests/column/column_device_view_test.cu b/cpp/tests/column/column_device_view_test.cu index c26a3046017..69d428be814 100644 --- a/cpp/tests/column/column_device_view_test.cu +++ b/cpp/tests/column/column_device_view_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,31 +14,33 @@ * limitations under the License. 
*/ +#include +#include +#include +#include + #include #include #include #include #include -#include -#include -#include -#include +#include struct ColumnDeviceViewTest : public cudf::test::BaseFixture { }; TEST_F(ColumnDeviceViewTest, Sample) { - using T = int32_t; - cudaStream_t stream = 0; + using T = int32_t; + rmm::cuda_stream_view stream{}; cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4, 5, 6}); auto output = cudf::allocate_like(input); auto input_device_view = cudf::column_device_view::create(input, stream); auto output_device_view = cudf::mutable_column_device_view::create(output->mutable_view(), stream); auto exec = rmm::exec_policy(stream); - EXPECT_NO_THROW(thrust::copy(exec->on(stream), + EXPECT_NO_THROW(thrust::copy(exec->on(stream.value()), input_device_view->begin(), input_device_view->end(), output_device_view->begin())); @@ -48,15 +50,15 @@ TEST_F(ColumnDeviceViewTest, Sample) TEST_F(ColumnDeviceViewTest, MismatchingType) { - using T = int32_t; - cudaStream_t stream = 0; + using T = int32_t; + rmm::cuda_stream_view stream{}; cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4, 5, 6}); auto output = cudf::allocate_like(input); auto input_device_view = cudf::column_device_view::create(input, stream); auto output_device_view = cudf::mutable_column_device_view::create(output->mutable_view(), stream); auto exec = rmm::exec_policy(stream); - EXPECT_THROW(thrust::copy(exec->on(stream), + EXPECT_THROW(thrust::copy(exec->on(stream.value()), input_device_view->begin(), input_device_view->end(), output_device_view->begin()), diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index afd4d6668dd..d30929b90c6 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,21 +14,23 @@ * limitations under the License. */ +#include +#include + #include #include #include #include #include -#include -#include + +#include class ColumnFactoryTest : public cudf::test::BaseFixture { cudf::size_type _size{1000}; - cudaStream_t _stream{0}; public: cudf::size_type size() { return _size; } - cudaStream_t stream() { return _stream; } + rmm::cuda_stream_view stream() { return rmm::cuda_stream_default; } }; template diff --git a/cpp/tests/copying/copy_tests.cu b/cpp/tests/copying/copy_tests.cu index 50d0e82222e..9e0251d944d 100644 --- a/cpp/tests/copying/copy_tests.cu +++ b/cpp/tests/copying/copy_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,20 +14,21 @@ * limitations under the License. 
*/ -#include -#include -#include #include -#include -#include - #include #include +#include +#include #include #include +#include +#include +#include #include +#include + template struct CopyTest : public cudf::test::BaseFixture { }; @@ -69,8 +70,8 @@ struct copy_if_else_tiny_grid_functor { std::unique_ptr operator()(cudf::column_view const& lhs, cudf::column_view const& rhs, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // output std::unique_ptr out = @@ -85,7 +86,7 @@ struct copy_if_else_tiny_grid_functor { // call the kernel with an artificially small grid cudf::detail::copy_if_else_kernel<32, T, decltype(lhs_iter), decltype(rhs_iter), Filter, false> - <<<1, 32, 0, stream>>>(lhs_iter, rhs_iter, filter, *out_dv, nullptr); + <<<1, 32, 0, stream.value()>>>(lhs_iter, rhs_iter, filter, *out_dv, nullptr); return out; } @@ -94,8 +95,8 @@ struct copy_if_else_tiny_grid_functor { std::unique_ptr operator()(cudf::column_view const& lhs, cudf::column_view const& rhs, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unexpected test execution"); } @@ -115,8 +116,8 @@ std::unique_ptr tiny_grid_launch(cudf::column_view const& lhs, lhs, rhs, filter, - rmm::mr::get_current_device_resource(), - (cudaStream_t)0); + rmm::cuda_stream_default, + rmm::mr::get_current_device_resource()); } TYPED_TEST(CopyTest, CopyIfElseTestTinyGrid) diff --git a/cpp/tests/copying/gather_struct_tests.cu b/cpp/tests/copying/gather_struct_tests.cu index c9923fb6457..3df44409062 100644 --- a/cpp/tests/copying/gather_struct_tests.cu +++ b/cpp/tests/copying/gather_struct_tests.cu @@ -14,33 +14,28 @@ * limitations under the License. 
*/ -#include -#include -#include +#include +#include +#include +#include +#include -#include -#include -#include -#include -#include #include +#include #include +#include #include #include +#include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include + #include -#include + +#include + +#include using vector_of_columns = std::vector>; using cudf::size_type; @@ -384,7 +379,7 @@ TYPED_TEST(TypedStructGatherTest, TestGatherStructOfStructsWithValidity) // Testing gather() on struct> // Factory to construct numeric column with configurable null-mask. - auto const numeric_column_exemplar = [](std::function pred) { + auto const numeric_column_exemplar = [](nvstd::function pred) { return fixed_width_column_wrapper{ {5, 10, 15, 20, 25, 30, 35, 45, 50, 55, 60, 65, 70, 75}, make_counting_transform_iterator(0, [=](auto i) { return pred(i); })}; diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index ea9459675ff..f642ad5bd90 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -14,14 +14,18 @@ * limitations under the License. 
*/ -#include -#include -#include #include #include #include #include #include + +#include +#include +#include + +#include + #include #include #include @@ -32,7 +36,7 @@ using TestTypes = cudf::test::Types; template > std::unique_ptr make_scalar( - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto s = new ScalarType(cudf::test::make_type_param_scalar(0), false, stream, mr); @@ -42,7 +46,7 @@ std::unique_ptr make_scalar( template > std::unique_ptr make_scalar( T value, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto s = new ScalarType(value, true, stream, mr); diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 87fe3145226..c63cab91be7 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,18 @@ * limitations under the License. 
*/ -#include -#include -#include -#include -#include - #include #include #include #include #include +#include +#include +#include +#include +#include + template struct NonTimestampTest : public cudf::test::BaseFixture { cudf::data_type type() { return cudf::data_type{cudf::type_to_id()}; } @@ -143,7 +143,6 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) template struct TypedDatetimeOpsTest : public cudf::test::BaseFixture { - cudaStream_t stream() { return cudaStream_t(0); } cudf::size_type size() { return cudf::size_type(10); } cudf::data_type type() { return cudf::data_type{cudf::type_to_id()}; } }; diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 16af9ea93bc..debf540ea8e 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,10 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include #include +#include + +#include + #include TEST(ExpectsTest, FalseCondition) @@ -58,30 +61,26 @@ void __global__ test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; } // calls. 
TEST(StreamCheck, FailedKernel) { - cudaStream_t stream; - CUDA_TRY(cudaStreamCreate(&stream)); + rmm::cuda_stream stream; int a; - test_kernel<<<0, 0, 0, stream>>>(&a); + test_kernel<<<0, 0, 0, stream.value()>>>(&a); #ifdef NDEBUG - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); #endif - EXPECT_THROW(CHECK_CUDA(stream), cudf::cuda_error); - CUDA_TRY(cudaStreamDestroy(stream)); + EXPECT_THROW(CHECK_CUDA(stream.value()), cudf::cuda_error); } TEST(StreamCheck, CatchFailedKernel) { - cudaStream_t stream; - CUDA_TRY(cudaStreamCreate(&stream)); + rmm::cuda_stream stream; int a; - test_kernel<<<0, 0, 0, stream>>>(&a); + test_kernel<<<0, 0, 0, stream.value()>>>(&a); #ifndef NDEBUG - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); #endif - CUDA_EXPECT_THROW_MESSAGE(CHECK_CUDA(stream), + CUDA_EXPECT_THROW_MESSAGE(CHECK_CUDA(stream.value()), "cudaErrorInvalidConfiguration " "invalid configuration argument"); - CUDA_TRY(cudaStreamDestroy(stream)); } __global__ void assert_false_kernel() { release_assert(false && "this kernel should die"); } diff --git a/cpp/tests/groupby/group_std_test.cpp b/cpp/tests/groupby/group_std_test.cpp index fdc69251428..e60aba08385 100644 --- a/cpp/tests/groupby/group_std_test.cpp +++ b/cpp/tests/groupby/group_std_test.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#ifdef NDEBUG // currently groupby std tests are not supported. See groupstd.cu + #include #include @@ -144,3 +146,5 @@ TYPED_TEST(groupby_std_test, ddof_non_default) } // namespace test } // namespace cudf + +#endif // NDEBUG diff --git a/cpp/tests/groupby/group_var_test.cpp b/cpp/tests/groupby/group_var_test.cpp index 2e49709a11f..6c1ea616212 100644 --- a/cpp/tests/groupby/group_var_test.cpp +++ b/cpp/tests/groupby/group_var_test.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#ifdef NDEBUG // currently groupby variance tests are not supported. 
See groupstd.cu + #include #include @@ -144,3 +146,5 @@ TYPED_TEST(groupby_var_test, ddof_non_default) } // namespace test } // namespace cudf + +#endif // NDEBUG diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu index 5e69b4fd15d..b1fd3fa0bb4 100644 --- a/cpp/tests/hash_map/map_test.cu +++ b/cpp/tests/hash_map/map_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-19, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,19 @@ * limitations under the License. */ +#include + #include #include -#include -#include #include +#include + #include #include + +#include + #include #include #include @@ -51,7 +56,7 @@ struct InsertTest : public cudf::test::BaseFixture { std::min(static_cast(size), std::numeric_limits::max()); pairs.resize(input_size); map = std::move(map_type::create(compute_hash_table_size(size))); - CUDA_TRY(cudaStreamSynchronize(0)); + rmm::cuda_stream_default.synchronize(); } const cudf::size_type size{10000}; diff --git a/cpp/tests/hash_map/multimap_test.cu b/cpp/tests/hash_map/multimap_test.cu index 3ed3e14ab09..1f1d20eb8ae 100644 --- a/cpp/tests/hash_map/multimap_test.cu +++ b/cpp/tests/hash_map/multimap_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,20 +14,21 @@ * limitations under the License. 
*/ +#include + #include -#include +#include -#include +#include #include +#include #include #include #include -#include - // This is necessary to do a parametrized typed-test over multiple template // arguments template @@ -61,7 +62,7 @@ class MultimapTest : public cudf::test::BaseFixture { MultimapTest(const size_type hash_table_size = 100) : the_map(multimap_type::create(hash_table_size)), size(hash_table_size) { - CUDA_TRY(cudaStreamSynchronize(0)); + rmm::cuda_stream_default.synchronize(); } ~MultimapTest() {} diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 7c9b0929e35..c3ecafe990a 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -14,13 +14,6 @@ * limitations under the License. */ -#include -#include -#include -#include -#include -#include - #include #include #include @@ -29,6 +22,14 @@ #include #include #include +#include +#include +#include +#include +#include +#include + +#include #include #include @@ -706,12 +707,12 @@ class custom_test_data_sink : public cudf::io::data_sink { bool supports_device_write() const override { return true; } - void device_write(void const* gpu_data, size_t size, cudaStream_t stream) + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { char* ptr = nullptr; CUDA_TRY(cudaMallocHost(&ptr, size)); - CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); outfile_.write(ptr, size); CUDA_TRY(cudaFreeHost(ptr)); } @@ -1135,12 +1136,12 @@ class custom_test_memmap_sink : public cudf::io::data_sink { bool supports_device_write() const override { return supports_device_writes; } - void device_write(void const* gpu_data, size_t size, cudaStream_t stream) + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) { char* ptr = 
nullptr; CUDA_TRY(cudaMallocHost(&ptr, size)); - CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); mm_writer->host_write(ptr, size); CUDA_TRY(cudaFreeHost(ptr)); } diff --git a/cpp/tests/scalar/factories_test.cpp b/cpp/tests/scalar/factories_test.cpp index 6d850e15e9f..c91fb6b3b5e 100644 --- a/cpp/tests/scalar/factories_test.cpp +++ b/cpp/tests/scalar/factories_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,17 +14,18 @@ * limitations under the License. */ +#include +#include + #include #include #include -#include -#include -class ScalarFactoryTest : public cudf::test::BaseFixture { - cudaStream_t _stream{0}; +#include +class ScalarFactoryTest : public cudf::test::BaseFixture { public: - cudaStream_t stream() { return _stream; } + rmm::cuda_stream_view stream() { return rmm::cuda_stream_default; } }; template diff --git a/cpp/tests/table/table_view_tests.cu b/cpp/tests/table/table_view_tests.cu index 506b0de4a36..528517d2be5 100644 --- a/cpp/tests/table/table_view_tests.cu +++ b/cpp/tests/table/table_view_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,15 +14,18 @@ * limitations under the License. 
*/ -#include -#include -#include -#include #include #include #include #include +#include +#include +#include +#include + +#include + #include // Compares two tables row by row, if table1 row is less than table2, then corresponding row value @@ -34,7 +37,7 @@ void row_comparison(cudf::table_view input1, cudf::mutable_column_view output, std::vector const& column_order) { - cudaStream_t stream = 0; + rmm::cuda_stream_view stream{}; auto device_table_1 = cudf::table_device_view::create(input1, stream); auto device_table_2 = cudf::table_device_view::create(input2, stream); @@ -43,7 +46,7 @@ void row_comparison(cudf::table_view input1, auto comparator = cudf::row_lexicographic_comparator( *device_table_1, *device_table_2, d_column_order.data().get()); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(input1.num_rows()), thrust::make_counting_iterator(0), diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu index dc51b039b11..b9cbcd7c8a5 100644 --- a/cpp/tests/wrappers/timestamps_test.cu +++ b/cpp/tests/wrappers/timestamps_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,12 @@ * limitations under the License. 
*/ +#include +#include +#include +#include +#include + #include #include #include @@ -21,15 +27,12 @@ #include #include #include -#include -#include -#include -#include -#include + +#include template struct ChronoColumnTest : public cudf::test::BaseFixture { - cudaStream_t stream() { return cudaStream_t(0); } + rmm::cuda_stream_view stream() { return rmm::cuda_stream_default; } cudf::size_type size() { return cudf::size_type(100); } cudf::data_type type() { return cudf::data_type{cudf::type_to_id()}; } }; diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 8ceee00598a..597672ec50a 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -109,7 +109,7 @@ class jni_writer_data_sink final : public cudf::io::data_sink { bool supports_device_write() const override { return true; } - void device_write(void const *gpu_data, size_t size, cudaStream_t stream) override { + void device_write(void const *gpu_data, size_t size, rmm::cuda_stream_view stream) override { JNIEnv *env = cudf::jni::get_jni_env(jvm); size_t left_to_copy = size; const char *copy_from = static_cast(gpu_data); @@ -117,7 +117,7 @@ class jni_writer_data_sink final : public cudf::io::data_sink { long buffer_amount_available = current_buffer_len - current_buffer_written; if (buffer_amount_available <= 0) { // should never be < 0, but just to be safe - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); rotate_buffer(env); buffer_amount_available = current_buffer_len - current_buffer_written; } @@ -126,14 +126,14 @@ class jni_writer_data_sink final : public cudf::io::data_sink { char *copy_to = current_buffer_data + current_buffer_written; CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost, - stream)); + stream.value())); copy_from = copy_from + amount_to_copy; current_buffer_written += amount_to_copy; total_written += amount_to_copy; left_to_copy -= amount_to_copy; } - 
CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); } void flush() override { diff --git a/java/src/main/native/src/map_lookup.cu b/java/src/main/native/src/map_lookup.cu index 95eea10e8e0..e1fc56ed834 100644 --- a/java/src/main/native/src/map_lookup.cu +++ b/java/src/main/native/src/map_lookup.cu @@ -119,7 +119,7 @@ get_gather_map_for_map_values(column_view const &input, string_scalar &lookup_ke gpu_find_first<<>>( *input_device_view, *output_view, lookup_key_device_view); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return gather_map; }