From ff53e23103f58ebfe0aebf8f4943a64bd958567d Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 29 Oct 2020 18:36:55 +1100 Subject: [PATCH 01/51] Fix cast warning. --- cpp/src/dictionary/detail/merge.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/dictionary/detail/merge.cu b/cpp/src/dictionary/detail/merge.cu index e2d2760642a..6448d711db1 100644 --- a/cpp/src/dictionary/detail/merge.cu +++ b/cpp/src/dictionary/detail/merge.cu @@ -59,7 +59,8 @@ std::unique_ptr merge(dictionary_column_view const& lcol, return make_dictionary_column( std::make_unique(lcol.keys(), stream, mr), std::move(indices_column), - rmm::device_buffer{lcol.has_nulls() || rcol.has_nulls() ? size_t{merged_size} : 0, stream, mr}, + rmm::device_buffer{ + lcol.has_nulls() || rcol.has_nulls() ? static_cast(merged_size) : 0, stream, mr}, lcol.null_count() + rcol.null_count()); } From 8c44adea46e08d1a4987c7a3e0ebc16c142c94e2 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 2 Nov 2020 14:48:19 +1100 Subject: [PATCH 02/51] Initial stream changes --- .../common/generate_benchmark_input.cpp | 10 +- cpp/benchmarks/copying/shift_benchmark.cu | 13 +- .../null_mask/set_null_mask_benchmark.cpp | 4 +- .../type_dispatcher_benchmark.cu | 14 +- cpp/docs/DOCUMENTATION.md | 4 +- cpp/docs/TRANSITIONGUIDE.md | 10 +- cpp/include/cudf/copying.hpp | 81 ++++---- cpp/include/cudf/detail/copy.hpp | 45 +++-- cpp/include/cudf/detail/copy_if.cuh | 2 +- cpp/include/cudf/detail/copy_if_else.cuh | 13 +- cpp/include/cudf/detail/gather.cuh | 8 +- cpp/include/cudf/detail/null_mask.hpp | 67 +++++-- cpp/include/cudf/detail/valid_if.cuh | 9 +- cpp/include/cudf/null_mask.hpp | 17 +- cpp/include/cudf/scalar/scalar.hpp | 62 ++++--- .../cudf/strings/detail/copy_if_else.cuh | 20 +- cpp/include/cudf/strings/detail/merge.cuh | 4 +- .../cudf/strings/detail/modify_strings.cuh | 5 +- cpp/include/cudf/strings/detail/scatter.cuh | 6 +- cpp/src/binaryop/binaryop.cpp | 45 ++--- cpp/src/binaryop/compiled/binary_ops.cu | 3 +- cpp/src/bitmask/null_mask.cu | 174 +++++++++++------- cpp/src/column/column.cu | 29 +-- cpp/src/column/column_factories.cpp | 16 +- cpp/src/copying/concatenate.cu | 5 +- cpp/src/copying/copy.cpp | 15 +- cpp/src/copying/copy.cu | 97 +++++----- cpp/src/copying/copy_range.cu | 3 +- cpp/src/copying/sample.cu | 17 +- cpp/src/copying/scatter.cu | 5 +- cpp/src/copying/shift.cu | 28 ++- cpp/src/copying/slice.cpp | 5 +- cpp/src/datetime/datetime_ops.cu | 16 +- cpp/src/dictionary/add_keys.cu | 11 +- cpp/src/dictionary/decode.cu | 7 +- cpp/src/dictionary/dictionary_factories.cu | 5 +- cpp/src/dictionary/encode.cu | 11 +- cpp/src/dictionary/replace.cu | 4 +- cpp/src/filling/fill.cu | 6 +- cpp/src/groupby/hash/groupby.cu | 3 +- cpp/src/groupby/sort/sort_helper.cu | 3 +- cpp/src/interop/from_arrow.cpp | 33 ++-- cpp/src/io/avro/reader_impl.cu | 3 +- cpp/src/io/csv/durations.cu | 4 +- cpp/src/io/utilities/column_buffer.hpp | 7 +- cpp/src/lists/copying/copying.cu | 4 +- cpp/src/merge/merge.cu | 2 +- cpp/src/quantiles/quantile.cu | 18 +- cpp/src/reductions/scan.cu | 31 ++-- cpp/src/replace/clamp.cu | 2 +- cpp/src/replace/nans.cu | 16 +- cpp/src/replace/nulls.cu | 19 +- cpp/src/replace/replace.cu | 9 +- cpp/src/reshape/byte_cast.cu | 7 +- cpp/src/reshape/interleave_columns.cu | 2 +- cpp/src/scalar/scalar.cpp | 8 +- cpp/src/sort/rank.cu | 11 +- cpp/src/strings/attributes.cu | 6 +- cpp/src/strings/case.cu | 7 +- cpp/src/strings/char_types/char_types.cu | 46 +++-- cpp/src/strings/combine.cu | 6 +- cpp/src/strings/contains.cu | 30 +-- 
cpp/src/strings/convert/convert_booleans.cu | 19 +- cpp/src/strings/convert/convert_datetime.cu | 35 ++-- cpp/src/strings/convert/convert_durations.cu | 22 ++- cpp/src/strings/convert/convert_floats.cu | 21 ++- cpp/src/strings/convert/convert_hex.cu | 29 +-- cpp/src/strings/convert/convert_integers.cu | 19 +- cpp/src/strings/convert/convert_ipv4.cu | 32 ++-- cpp/src/strings/convert/convert_urls.cu | 9 +- cpp/src/strings/copying/concatenate.cu | 4 +- cpp/src/strings/filter_chars.cu | 7 +- cpp/src/strings/find.cu | 46 +++-- cpp/src/strings/findall.cu | 37 ++-- cpp/src/strings/padding.cu | 9 +- cpp/src/strings/replace/replace.cu | 12 +- cpp/src/strings/split/split.cu | 25 +-- cpp/src/strings/strip.cu | 6 +- cpp/src/strings/substring.cu | 4 +- cpp/src/strings/translate.cu | 5 +- cpp/src/strings/wrap.cu | 4 +- cpp/src/text/normalize.cu | 24 ++- cpp/src/text/replace.cu | 7 +- cpp/src/text/stemmer.cu | 27 +-- cpp/src/transform/encode.cu | 2 +- cpp/src/unary/cast_ops.cu | 15 +- cpp/src/unary/math_ops.cu | 68 ++++--- 87 files changed, 993 insertions(+), 668 deletions(-) diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index d516c084f03..e82a58c3a5b 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -296,9 +297,9 @@ std::unique_ptr create_random_column(data_profile const& profile, return std::make_unique( cudf::data_type{cudf::type_to_id()}, num_rows, - rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), cudaStream_t(0)), + rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), rmm::cuda_stream_default), rmm::device_buffer( - null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0))); + null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default)); } /** @@ -483,7 +484,8 @@ std::unique_ptr create_random_column(data_profile auto offsets_column = std::make_unique( cudf::data_type{cudf::type_id::INT32}, offsets.size(), - rmm::device_buffer(offsets.data(), offsets.size() * sizeof(int32_t), cudaStream_t(0))); + rmm::device_buffer( + offsets.data(), offsets.size() * sizeof(int32_t), rmm::cuda_stream_default)); list_column = cudf::make_lists_column( num_rows, @@ -491,7 +493,7 @@ std::unique_ptr create_random_column(data_profile std::move(current_child_column), cudf::UNKNOWN_NULL_COUNT, rmm::device_buffer( - null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), cudaStream_t(0))); + null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default)); } return list_column; // return the top-level column } diff --git a/cpp/benchmarks/copying/shift_benchmark.cu b/cpp/benchmarks/copying/shift_benchmark.cu index 648bb699dbf..4cf3455debb 100644 --- a/cpp/benchmarks/copying/shift_benchmark.cu +++ b/cpp/benchmarks/copying/shift_benchmark.cu @@ -14,17 +14,8 @@ template > std::unique_ptr make_scalar( - cudaStream_t stream = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto s = new ScalarType(0, false, stream, mr); - return std::unique_ptr(s); -} - -template > -std::unique_ptr make_scalar( - T value, - cudaStream_t stream = 0, + T value = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto s = new ScalarType(value, true, stream, mr); diff --git 
a/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp b/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp index 2f47393731a..e0a35ff0097 100644 --- a/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp +++ b/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp @@ -31,7 +31,7 @@ void BM_setnullmask(benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::set_null_mask(static_cast(mask.data()), begin, end, true, 0); + cudf::set_null_mask(static_cast(mask.data()), begin, end, true); } state.SetBytesProcessed(static_cast(state.iterations()) * size / 8); @@ -44,4 +44,4 @@ void BM_setnullmask(benchmark::State& state) ->Range(1 << 10, 1 << 30) \ ->UseManualTime(); -NBM_BENCHMARK_DEFINE(SetNullMaskKernel); \ No newline at end of file +NBM_BENCHMARK_DEFINE(SetNullMaskKernel); diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu index 7b1068d09dd..222a2c40618 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu @@ -90,13 +90,13 @@ struct ColumnHandle { template void operator()(mutable_column_device_view source_column, int work_per_thread, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { cudf::detail::grid_1d grid_config{source_column.size(), block_size}; int grid_size = grid_config.num_blocks; // Launch the kernel. host_dispatching_kernel - <<>>(source_column); + <<>>(source_column); } }; @@ -144,14 +144,14 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread) // std::vector v_stream(n_cols); for (int c = 0; c < n_cols; c++) { auto d_column = mutable_column_device_view::create(input.column(c)); - cudf::type_dispatcher( - d_column->type(), ColumnHandle{}, *d_column, work_per_thread); + // cudf::type_dispatcher( + // d_column->type(), ColumnHandle{}, *d_column, work_per_thread); } } else if (dispatching_type == DEVICE_DISPATCHING) { auto d_table_view = mutable_table_device_view::create(input); - auto f = device_dispatching_kernel; + // auto f = device_dispatching_kernel; // Launch the kernel - f<<>>(*d_table_view); + // f<<>>(*d_table_view); } else if (dispatching_type == NO_DISPATCHING) { auto f = no_dispatching_kernel; // Launch the kernel @@ -160,7 +160,7 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread) } template -void type_dispatcher_benchmark(benchmark::State& state) +void type_dispatcher_benchmark(::benchmark::State& state) { const cudf::size_type source_size = static_cast(state.range(1)); diff --git a/cpp/docs/DOCUMENTATION.md b/cpp/docs/DOCUMENTATION.md index 6b0a51dbf1b..b219543e3d6 100644 --- a/cpp/docs/DOCUMENTATION.md +++ b/cpp/docs/DOCUMENTATION.md @@ -225,7 +225,7 @@ You can use the `@copydoc` tag to avoid duplicating the comment block for a func */ ``` -Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `cudaStream_t` parameter. +Also, `@copydoc` is useful when documenting a `detail` function that differs only by the `stream` parameter. 
```c++ /** @@ -235,7 +235,7 @@ Also, `@copydoc` is useful when documenting a `detail` function that differs onl */ std::vector segmented_count_set_bits(bitmask_type const* bitmask, std::vector const& indices, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); ``` Note, you must specify the whole signature of the function, including optional parameters, so that doxygen will be able to locate it. diff --git a/cpp/docs/TRANSITIONGUIDE.md b/cpp/docs/TRANSITIONGUIDE.md index f7de4863952..8786c4c039b 100644 --- a/cpp/docs/TRANSITIONGUIDE.md +++ b/cpp/docs/TRANSITIONGUIDE.md @@ -131,7 +131,7 @@ A *mutable*, non-owning view of a table. We do not yet expose CUDA streams in external libcudf APIs. However, in order to ease the transition to future use of streams, all libcudf APIs that allocate device memory or execute a kernel should be implemented using asynchronous APIs on the default stream (e.g., stream 0). -The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `cudaStream_t` parameter at the end defaulted to `0`. +The recommended pattern for doing this is to make the definition of the external API invoke an internal API in the `detail` namespace. The internal `detail` API will have all the same parameters, plus a `rmm::cuda_stream_view` parameter at the end defaulted to `rmm::cuda_stream_default`. The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the defaulted stream parameter. In order to make the `detail` API callable from other libcudf functions, it should be exposed in a header placed in the `cudf/cpp/include/detail/` directory. @@ -144,19 +144,19 @@ void external_function(...); // cpp/include/cudf/detail/header.hpp namespace detail{ -void external_function(..., cudaStream_t stream = 0) +void external_function(..., rmm::cuda_stream_view stream = rmm::cuda_stream_default) } // namespace detail // cudf/src/implementation.cpp namespace detail{ // defaulted stream parameter - void external_function(..., cudaStream_t stream){ + void external_function(..., rmm::cuda_stream_view stream){ // implementation uses stream w/ async APIs RMM_ALLOC(...,stream); - CUDA_TRY(cudaMemcpyAsync(...,stream)); + CUDA_TRY(cudaMemcpyAsync(...,stream.value())); kernel<<<..., stream>>>(...); thrust::algorithm(rmm::exec_policy(stream)->on(stream), ...); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); RMM_FREE(...,stream); } } // namespace detail diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index a20d9d653ce..b1483fea133 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -286,6 +286,46 @@ std::unique_ptr copy_range( size_type target_begin, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Creates a new column by shifting all values by an offset. + * + * @ingroup copy_shift + * + * Elements will be determined by `output[idx] = input[idx - offset]`. + * Some elements in the output may be indeterminable from the input. For those + * elements, the value will be determined by `fill_values`. 
+ * + * @code{.pseudo} + * Examples + * ------------------------------------------------- + * input = [0, 1, 2, 3, 4] + * offset = 3 + * fill_values = @ + * return = [@, @, @, 0, 1] + * ------------------------------------------------- + * input = [5, 4, 3, 2, 1] + * offset = -2 + * fill_values = 7 + * return = [3, 2, 1, 7, 7] + * @endcode + * + * @note if the input is nullable, the output will be nullable. + * @note if the fill value is null, the output will be nullable. + * + * @param input Column to be shifted. + * @param offset The offset by which to shift the input. + * @param fill_value Fill value for indeterminable outputs. + * @param mr Device memory resource used to allocate the returned result's device memory + * + * @throw cudf::logic_error if @p input dtype is not fixed-with. + * @throw cudf::logic_error if @p fill_value dtype does not match @p input dtype. + */ +std::unique_ptr shift( + column_view const& input, + size_type offset, + scalar const& fill_value, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Slices a `column_view` into a set of `column_view`s according to a set of indices. * @@ -479,7 +519,6 @@ struct contiguous_split_result { * @param input View of a table to split * @param splits A vector of indices where the view will be split * @param[in] mr Device memory resource used to allocate the returned result's device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @return The set of requested views of `input` indicated by the `splits` and the viewed memory * buffer. */ @@ -513,46 +552,6 @@ std::unique_ptr copy_if_else( column_view const& boolean_mask, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Creates a new column by shifting all values by an offset. - * - * @ingroup copy_shift - * - * Elements will be determined by `output[idx] = input[idx - offset]`. - * Some elements in the output may be indeterminable from the input. For those - * elements, the value will be determined by `fill_values`. - * - * @code{.pseudo} - * Examples - * ------------------------------------------------- - * input = [0, 1, 2, 3, 4] - * offset = 3 - * fill_values = @ - * return = [@, @, @, 0, 1] - * ------------------------------------------------- - * input = [5, 4, 3, 2, 1] - * offset = -2 - * fill_values = 7 - * return = [3, 2, 1, 7, 7] - * @endcode - * - * @note if the input is nullable, the output will be nullable. - * @note if the fill value is null, the output will be nullable. - * - * @param input Column to be shifted. - * @param offset The offset by which to shift the input. - * @param fill_value Fill value for indeterminable outputs. - * - * @throw cudf::logic_error if @p input dtype is not fixed-with. - * @throw cudf::logic_error if @p fill_value dtype does not match @p input dtype. 
- */ -std::unique_ptr shift( - column_view const& input, - size_type offset, - scalar const& fill_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); - /** * @brief Returns a new column, where each element is selected from either @p lhs or * @p rhs based on the value of the corresponding element in @p boolean_mask diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index 22399043bb2..0312f1ebe75 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -21,6 +21,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -71,7 +73,20 @@ ColumnView slice(ColumnView const& input, cudf::size_type begin, cudf::size_type */ std::vector slice(column_view const& input, std::vector const& indices, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +/** + * @copydoc cudf::shift(column_view const&,size_type,scalar const&, + * rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr shift( + column_view const& input, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::contiguous_split @@ -81,8 +96,8 @@ std::vector slice(column_view const& input, std::vector contiguous_split( cudf::table_view const& input, std::vector const& splits, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::allocate_like(column_view const&, size_type, mask_allocation_policy, @@ -94,8 +109,8 @@ std::unique_ptr allocate_like( column_view const& input, size_type size, mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::copy_if_else( column_view const&, column_view const&, @@ -107,8 +122,8 @@ std::unique_ptr copy_if_else( column_view const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::copy_if_else( scalar const&, column_view const&, @@ -120,8 +135,8 @@ std::unique_ptr copy_if_else( scalar const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::copy_if_else( column_view const&, scalar const&, @@ -133,8 +148,8 @@ std::unique_ptr copy_if_else( column_view const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = 
rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::copy_if_else( scalar const&, scalar const&, @@ -146,8 +161,8 @@ std::unique_ptr copy_if_else( scalar const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::sample @@ -159,8 +174,8 @@ std::unique_ptr sample( size_type const n, sample_with_replacement replacement = sample_with_replacement::FALSE, int64_t const seed = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index ce581d71ac7..9399df22450 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -213,7 +213,7 @@ struct scatter_gather_functor { cudaStream_t stream = 0) { auto output_column = cudf::detail::allocate_like( - input, output_size, cudf::mask_allocation_policy::RETAIN, mr, stream); + input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); auto output = output_column->mutable_view(); bool has_valid = input.nullable(); diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 0bad8a1a86f..d5be077d27b 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -27,9 +27,10 @@ #include #include -#include #include +#include + namespace cudf { namespace detail { namespace { // anonymous @@ -162,8 +163,8 @@ std::unique_ptr copy_if_else( LeftIter lhs_end, RightIter rhs, FilterFn filter, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { using Element = typename thrust::tuple_element<0, typename thrust::iterator_traits::value_type>::type; @@ -177,7 +178,7 @@ std::unique_ptr copy_if_else( make_fixed_width_column(data_type(type_to_id()), size, nullable ? 
mask_state::UNINITIALIZED : mask_state::UNALLOCATED, - stream, + stream.value(), mr); auto out_v = mutable_column_device_view::create(*out); @@ -188,14 +189,14 @@ std::unique_ptr copy_if_else( // call the kernel copy_if_else_kernel - <<>>( + <<>>( lhs_begin, rhs, filter, *out_v, valid_count.data()); out->set_null_count(size - valid_count.value()); } else { // call the kernel copy_if_else_kernel - <<>>(lhs_begin, rhs, filter, *out_v, nullptr); + <<>>(lhs_begin, rhs, filter, *out_v, nullptr); } return out; diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index dd6266f258b..f20af839916 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -24,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -175,7 +175,7 @@ struct column_gatherer_impl { auto const num_rows = cudf::distance(gather_map_begin, gather_map_end); auto const policy = cudf::mask_allocation_policy::NEVER; auto destination_column = - cudf::detail::allocate_like(source_column, num_rows, policy, mr, stream); + cudf::detail::allocate_like(source_column, num_rows, policy, stream, mr); using Type = device_storage_type_t; @@ -403,7 +403,7 @@ struct column_gatherer_impl { // Perform gather on just the indices column_view indices = dictionary.get_indices_annotated(); auto new_indices = cudf::detail::allocate_like( - indices, output_count, cudf::mask_allocation_policy::NEVER, mr, stream); + indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr); gather_helper( cudf::detail::indexalator_factory::make_input_iterator(indices), indices.size(), @@ -496,7 +496,7 @@ void gather_bitmask(table_view const& source, not target[i]->nullable()) { auto const state = op == gather_bitmask_op::PASSTHROUGH ? mask_state::ALL_VALID : mask_state::UNINITIALIZED; - auto mask = create_null_mask(target[i]->size(), state, stream, mr); + auto mask = detail::create_null_mask(target[i]->size(), state, stream, mr); target[i]->set_null_mask(std::move(mask), 0); } } diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 6319da752bc..4b2c5b0a8d6 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -18,9 +18,33 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { + +/** + * @copydoc cudf::create_null_mask(size_type, mask_state, rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + **/ +rmm::device_buffer create_null_mask( + size_type size, + mask_state state, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::set_null_mask(bitmask_type*, size_type, size_type, bool) + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ **/ +void set_null_mask(bitmask_type *bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + /** * @copydoc cudf::segmented_count_set_bits * @@ -28,7 +52,7 @@ namespace detail { */ std::vector segmented_count_set_bits(bitmask_type const *bitmask, std::vector const &indices, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** * @copydoc cudf::segmented_count_unset_bits @@ -37,22 +61,41 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, */ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, std::vector const &indices, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); /** - * @brief Returns a bitwise AND of the specified bitmasks + * @copydoc cudf::copy_bitmask(bitmask_type const*, size_type, size_type, + *rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + **/ +rmm::device_buffer copy_bitmask( + bitmask_type const *mask, + size_type begin_bit, + size_type end_bit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::copy_bitmask(column_view const& view, rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + **/ +rmm::device_buffer copy_bitmask( + column_view const &view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc bitmask_and(std::vector, std::vector const&, size_type, + * rmm::mr::device_memory_resource *) * - * @param masks The list of data pointers of the bitmasks to be ANDed - * @param begin_bits The bit offsets from which each mask is to be ANDed - * @param mask_size The number of bits to be ANDed in each mask * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned device_buffer - * @return rmm::device_buffer Output bitmask */ rmm::device_buffer bitmask_and(std::vector const &masks, std::vector const &begin_bits, size_type mask_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr); /** @@ -61,8 +104,8 @@ rmm::device_buffer bitmask_and(std::vector const &masks, * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ rmm::device_buffer bitmask_and(table_view const &view, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr); /** * @brief Performs a bitwise AND of the specified bitmasks, @@ -80,7 +123,7 @@ void inplace_bitmask_and(bitmask_type *dest_mask, std::vector const &masks, std::vector const &begin_bits, size_type mask_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr); } // namespace detail diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index c9719228f87..011a3fa616c 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -16,8 +16,8 @@ #pragma once +#include #include -#include #include #include #include @@ -25,6 +25,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -87,14 +88,14 @@ std::pair valid_if( InputIterator begin, InputIterator end, Predicate p, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(begin <= end, "Invalid range."); size_type size = thrust::distance(begin, end); - auto null_mask = create_null_mask(size, mask_state::UNINITIALIZED, stream, mr); + auto null_mask = detail::create_null_mask(size, mask_state::UNINITIALIZED, stream, mr); size_type null_count{0}; if (size > 0) { @@ -103,7 +104,7 @@ std::pair valid_if( constexpr size_type block_size{256}; grid_1d grid{size, block_size}; - valid_if_kernel<<>>( + valid_if_kernel<<>>( static_cast(null_mask.data()), begin, size, p, valid_count.data()); null_count = size - valid_count.value(stream); diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 50ea7ead37d..110fd2b5087 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -76,7 +76,6 @@ size_type num_bitmask_words(size_type number_of_bits); * * @param size The number of elements to be represented by the mask * @param state The desired state of the mask - * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned device_buffer. * @return rmm::device_buffer A `device_buffer` for use as a null bitmask * satisfying the desired size and state @@ -84,7 +83,6 @@ size_type num_bitmask_words(size_type number_of_bits); rmm::device_buffer create_null_mask( size_type size, mask_state state, - cudaStream_t stream = 0, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -98,13 +96,8 @@ rmm::device_buffer create_null_mask( * @param begin_bit Index of the first bit to set (inclusive) * @param end_bit Index of the last bit to set (exclusive) * @param valid If true set all entries to valid; otherwise, set all to null. - * @param stream CUDA stream used for device memory operations and kernel launches. 
**/ -void set_null_mask(bitmask_type* bitmask, - size_type begin_bit, - size_type end_bit, - bool valid, - cudaStream_t stream = 0); +void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid); /** * @brief Given a bitmask, counts the number of set (1) bits in the range @@ -188,7 +181,6 @@ std::vector segmented_count_unset_bits(bitmask_type const* bitmask, * @param mask Bitmask residing in device memory whose bits will be copied * @param begin_bit Index of the first bit to be copied (inclusive) * @param end_bit Index of the last bit to be copied (exclusive) - * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned device_buffer * @return rmm::device_buffer A `device_buffer` containing the bits * `[begin_bit, end_bit)` from `mask`. @@ -197,7 +189,6 @@ rmm::device_buffer copy_bitmask( bitmask_type const* mask, size_type begin_bit, size_type end_bit, - cudaStream_t stream = 0, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -207,14 +198,12 @@ rmm::device_buffer copy_bitmask( * Returns empty `device_buffer` if the column is not nullable * * @param view Column view whose bitmask needs to be copied - * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned device_buffer * @return rmm::device_buffer A `device_buffer` containing the bits * `[view.offset(), view.offset() + view.size())` from `view`'s bitmask. **/ rmm::device_buffer copy_bitmask( column_view const& view, - cudaStream_t stream = 0, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -224,14 +213,12 @@ rmm::device_buffer copy_bitmask( * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns - * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned device_buffer * @return rmm::device_buffer Output bitmask */ rmm::device_buffer bitmask_and( table_view const& view, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index ed3a6aebf31..dcce9f043e8 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -28,6 +28,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" /** * @file @@ -67,7 +68,10 @@ class scalar { * @param is_valid true: set the value to valid. false: set it to null * @param stream CUDA stream used for device memory operations. 
*/ - void set_valid(bool is_valid, cudaStream_t stream = 0) { _is_valid.set_value(is_valid, stream); } + void set_valid(bool is_valid, rmm::cuda_stream_view stream = rmm::cuda_stream_default) + { + _is_valid.set_value(is_valid, stream); + } /** * @brief Indicates whether the scalar contains a valid value @@ -78,7 +82,10 @@ class scalar { * @return true Value is valid * @return false Value is invalid/null */ - bool is_valid(cudaStream_t stream = 0) const { return _is_valid.value(stream); } + bool is_valid(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const + { + return _is_valid.value(stream); + } /** * @brief Returns a raw pointer to the validity bool in device memory @@ -109,7 +116,7 @@ class scalar { */ scalar(data_type type, bool is_valid = false, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : _type(type), _is_valid(is_valid, stream, mr) { @@ -136,7 +143,7 @@ class fixed_width_scalar : public scalar { * @param value New value of scalar * @param stream CUDA stream used for device memory operations. */ - void set_value(T value, cudaStream_t stream = 0) + void set_value(T value, rmm::cuda_stream_view stream = rmm::cuda_stream_default) { _data.set_value(value, stream); this->set_valid(true, stream); @@ -152,7 +159,10 @@ class fixed_width_scalar : public scalar { * * @param stream CUDA stream used for device memory operations. */ - T value(cudaStream_t stream = 0) const { return _data.value(stream); } + T value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const + { + return _data.value(stream); + } /** * @brief Returns a raw pointer to the value in device memory @@ -179,7 +189,7 @@ class fixed_width_scalar : public scalar { */ fixed_width_scalar(T value, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar(data_type(type_to_id()), is_valid, stream, mr), _data(value, stream, mr) { @@ -195,7 +205,7 @@ class fixed_width_scalar : public scalar { */ fixed_width_scalar(rmm::device_scalar&& data, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar(data_type(type_to_id()), is_valid, stream, mr), _data{std::forward>(data)} @@ -232,7 +242,7 @@ class numeric_scalar : public detail::fixed_width_scalar { */ numeric_scalar(T value, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : detail::fixed_width_scalar(value, is_valid, stream, mr) { @@ -248,7 +258,7 @@ class numeric_scalar : public detail::fixed_width_scalar { */ numeric_scalar(rmm::device_scalar&& data, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : detail::fixed_width_scalar(std::forward>(data), is_valid, stream, mr) { @@ -286,7 +296,7 @@ class fixed_point_scalar : public scalar { fixed_point_scalar(rep_type value, numeric::scale_type scale, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar{data_type{type_to_id(), 
static_cast(scale)}, is_valid, stream, mr}, _data{value} @@ -303,7 +313,7 @@ class fixed_point_scalar : public scalar { */ fixed_point_scalar(rep_type value, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar{data_type{type_to_id(), 0}, is_valid, stream, mr}, _data{value} { @@ -319,7 +329,7 @@ class fixed_point_scalar : public scalar { */ fixed_point_scalar(T value, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar{data_type{type_to_id(), 0}, is_valid, stream, mr}, _data{numeric::scaled_integer{value}.value} @@ -338,7 +348,7 @@ class fixed_point_scalar : public scalar { */ fixed_point_scalar(rmm::device_scalar&& data, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar{data_type{type_to_id()}, is_valid, stream, mr}, // note that scale is ignored here _data{std::forward>(data)} @@ -350,7 +360,10 @@ class fixed_point_scalar : public scalar { * * @param stream CUDA stream used for device memory operations. */ - rep_type value(cudaStream_t stream = 0) const { return _data.value(stream); } + rep_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const + { + return _data.value(stream); + } /** * @brief Returns a raw pointer to the value in device memory @@ -390,7 +403,7 @@ class string_scalar : public scalar { */ string_scalar(std::string const& string, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar(data_type(type_id::STRING), is_valid), _data(string.data(), string.size(), stream, mr) { @@ -407,7 +420,7 @@ class string_scalar : public scalar { */ string_scalar(value_type const& source, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : scalar(data_type(type_id::STRING), is_valid), _data(source.data(), source.size_bytes(), stream, mr) @@ -425,7 +438,7 @@ class string_scalar : public scalar { */ string_scalar(rmm::device_scalar& data, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : string_scalar(data.value(stream), is_valid, stream, mr) { @@ -441,14 +454,17 @@ class string_scalar : public scalar { * * @param stream CUDA stream used for device memory operations. */ - std::string to_string(cudaStream_t stream = 0) const; + std::string to_string(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Get the value of the scalar as a string_view * * @param stream CUDA stream used for device memory operations. 
*/ - value_type value(cudaStream_t stream = 0) const { return value_type{data(), size()}; } + value_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const + { + return value_type{data(), size()}; + } /** * @brief Returns the size of the string in bytes @@ -492,7 +508,7 @@ class chrono_scalar : public detail::fixed_width_scalar { */ chrono_scalar(T value, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : detail::fixed_width_scalar(value, is_valid, stream, mr) { @@ -508,7 +524,7 @@ class chrono_scalar : public detail::fixed_width_scalar { */ chrono_scalar(rmm::device_scalar&& data, bool is_valid = true, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : detail::fixed_width_scalar(std::forward>(data), is_valid, stream, mr) { @@ -535,7 +551,7 @@ struct timestamp_scalar : chrono_scalar { template timestamp_scalar(Duration2 const& value, bool is_valid, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : chrono_scalar(T{typename T::duration{value}}, is_valid, stream, mr) { @@ -564,7 +580,7 @@ struct duration_scalar : chrono_scalar { */ duration_scalar(typename T::rep value, bool is_valid, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) : chrono_scalar(T{value}, is_valid, stream, mr) { diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 21954104d72..7bfe1df4239 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -21,6 +21,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace strings { @@ -54,11 +55,11 @@ std::unique_ptr copy_if_else( StringPairIterLeft lhs_end, StringPairIterRight rhs_begin, Filter filter_fn, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = std::distance(lhs_begin, lhs_end); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); auto execpol = rmm::exec_policy(stream); // create null mask @@ -86,16 +87,17 @@ std::unique_ptr copy_if_else( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto d_offsets = offsets_column->view().template data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = create_chars_child_column(strings_count, null_count, bytes, mr, stream); - auto d_chars = chars_column->mutable_view().template data(); + size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; + auto chars_column = + create_chars_child_column(strings_count, null_count, 
bytes, mr, stream.value()); + auto d_chars = chars_column->mutable_view().template data(); // fill in chars thrust::for_each_n( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [lhs_begin, rhs_begin, filter_fn, d_offsets, d_chars] __device__(size_type idx) { @@ -110,7 +112,7 @@ std::unique_ptr copy_if_else( std::move(chars_column), null_count, std::move(null_mask), - stream, + stream.value(), mr); } diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index 114a4195a95..6bdbce3c933 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -17,8 +17,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -62,7 +62,7 @@ std::unique_ptr merge(strings_column_view const& lhs, rmm::device_buffer null_mask{0, stream, mr}; size_type null_count = lhs.null_count() + rhs.null_count(); if (null_count > 0) - null_mask = create_null_mask(strings_count, mask_state::ALL_VALID, stream, mr); + null_mask = cudf::detail::create_null_mask(strings_count, mask_state::ALL_VALID, stream, mr); // build offsets column auto offsets_transformer = [d_lhs, d_rhs] __device__(auto index_pair) { diff --git a/cpp/include/cudf/strings/detail/modify_strings.cuh b/cpp/include/cudf/strings/detail/modify_strings.cuh index e61a404441b..c90ca4575f8 100644 --- a/cpp/include/cudf/strings/detail/modify_strings.cuh +++ b/cpp/include/cudf/strings/detail/modify_strings.cuh @@ -16,12 +16,14 @@ #pragma once #include +#include #include #include #include #include #include +#include namespace cudf { namespace strings { @@ -65,7 +67,8 @@ std::unique_ptr modify_strings(strings_column_view const& strings, size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // get the lookup tables used for case conversion device_probe_functor d_probe_fctr{d_column, std::forward(args)...}; diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index 53d2310364d..627b9902506 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -17,10 +17,13 @@ #include #include +#include #include #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -62,7 +65,8 @@ std::unique_ptr scatter( // create null mask -- caller must update this rmm::device_buffer null_mask{0, stream, mr}; - if (target.has_nulls()) null_mask = copy_bitmask(target.parent(), stream, mr); + if (target.has_nulls()) + null_mask = cudf::detail::copy_bitmask(target.parent(), rmm::cuda_stream_view{stream}, mr); // create string vectors rmm::device_vector target_vector = create_string_vector_from_column(target, stream); diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 52075c6d93b..55aabb87d8d 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -17,35 +17,36 @@ * limitations under the License. 
*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - #include +#include +#include + #include #include #include -#include -#include // replace eventually - -#include "compiled/binary_ops.hpp" -#include "cudf/binaryop.hpp" -#include "cudf/fixed_point/fixed_point.hpp" -#include "cudf/types.hpp" - #include #include #include #include #include #include + +#include +#include +#include // replace eventually +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { @@ -62,9 +63,9 @@ rmm::device_buffer scalar_col_valid_mask_and(column_view const& col, if (col.is_empty()) return rmm::device_buffer{0, stream, mr}; if (not s.is_valid()) { - return create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr); + return cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr); } else if (s.is_valid() and col.nullable()) { - return copy_bitmask(col, stream, mr); + return cudf::detail::copy_bitmask(col, rmm::cuda_stream_view{stream}, mr); } else { return rmm::device_buffer{0, stream, mr}; } @@ -336,7 +337,7 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr); } else { - auto new_mask = bitmask_and(table_view({lhs, rhs}), mr, stream); + auto new_mask = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); return make_fixed_width_column( output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); } @@ -731,7 +732,7 @@ std::unique_ptr binary_operation(column_view const& lhs, CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match"); - auto new_mask = bitmask_and(table_view({lhs, rhs}), mr, stream); + auto new_mask = bitmask_and(table_view({lhs, rhs}), stream, mr); auto out = make_fixed_width_column( output_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 0109e788eb4..e21681a8467 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -164,7 +165,7 @@ struct binary_op { rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - auto new_mask = bitmask_and(table_view({lhs, rhs}), mr, stream); + auto new_mask = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); auto out = make_fixed_width_column( out_type, lhs.size(), std::move(new_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 429697f64c6..bc464cab372 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -25,17 +25,21 @@ #include #include +#include +#include +#include + #include #include #include #include + #include -#include -#include #include #include #include +#include "rmm/mr/device/device_memory_resource.hpp" namespace cudf { size_type state_null_count(mask_state state, size_type size) @@ -67,10 +71,12 @@ size_type num_bitmask_words(size_type number_of_bits) detail::size_in_bits()); } +namespace detail { + // Create a device_buffer for a null mask rmm::device_buffer create_null_mask(size_type size, mask_state state, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { size_type mask_size{0}; @@ -81,13 
+87,14 @@ rmm::device_buffer create_null_mask(size_type size, if (state != mask_state::UNINITIALIZED) { uint8_t fill_value = (state == mask_state::ALL_VALID) ? 0xff : 0x00; - CUDA_TRY( - cudaMemsetAsync(static_cast(mask.data()), fill_value, mask_size, stream)); + CUDA_TRY(cudaMemsetAsync( + static_cast(mask.data()), fill_value, mask_size, stream.value())); } return mask; } +namespace { __global__ void set_null_mask_kernel(bitmask_type *__restrict__ destination, size_type begin_bit, size_type end_bit, @@ -116,12 +123,15 @@ __global__ void set_null_mask_kernel(bitmask_type *__restrict__ destination, } } } +} // namespace -// Set pre-allocated null mask of given bit range [begin_bit, end_bit) -// to valid, if valid==true, +// Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; -void set_null_mask( - bitmask_type *bitmask, size_type begin_bit, size_type end_bit, bool valid, cudaStream_t stream) +void set_null_mask(bitmask_type *bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(begin_bit >= 0, "Invalid range."); @@ -130,12 +140,29 @@ void set_null_mask( auto number_of_mask_words = num_bitmask_words(end_bit) - begin_bit / detail::size_in_bits(); cudf::detail::grid_1d config(number_of_mask_words, 256); - set_null_mask_kernel<<>>( + set_null_mask_kernel<<>>( static_cast(bitmask), begin_bit, end_bit, valid, number_of_mask_words); CHECK_CUDA(stream); } } +} // namespace detail + +// Create a device_buffer for a null mask +rmm::device_buffer create_null_mask(size_type size, + mask_state state, + rmm::mr::device_memory_resource *mr) +{ + return detail::create_null_mask(size, state, rmm::cuda_stream_default, mr); +} + +// Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, +// or null, otherwise; +void set_null_mask(bitmask_type *bitmask, size_type begin_bit, size_type end_bit, bool valid) +{ + return detail::set_null_mask(bitmask, begin_bit, end_bit, valid); +} + namespace { /** @@ -371,12 +398,56 @@ struct to_word_index : public thrust::unary_function { namespace detail { +// Create a bitmask from a specific range +rmm::device_buffer copy_bitmask(bitmask_type const *mask, + size_type begin_bit, + size_type end_bit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(begin_bit >= 0, "Invalid range."); + CUDF_EXPECTS(begin_bit <= end_bit, "Invalid bit range."); + rmm::device_buffer dest_mask{}; + auto num_bytes = bitmask_allocation_size_bytes(end_bit - begin_bit); + if ((mask == nullptr) || (num_bytes == 0)) { return dest_mask; } + if (begin_bit == 0) { + dest_mask = rmm::device_buffer{static_cast(mask), num_bytes, stream, mr}; + } else { + auto number_of_mask_words = num_bitmask_words(end_bit - begin_bit); + dest_mask = rmm::device_buffer{num_bytes, stream, mr}; + cudf::detail::grid_1d config(number_of_mask_words, 256); + copy_offset_bitmask<<>>( + static_cast(dest_mask.data()), + mask, + begin_bit, + end_bit, + number_of_mask_words); + CHECK_CUDA(stream.value()); + } + return dest_mask; +} + +// Create a bitmask from a column view +rmm::device_buffer copy_bitmask(column_view const &view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + CUDF_FUNC_RANGE(); + rmm::device_buffer null_mask{0, stream, mr}; + if (view.nullable()) { + null_mask = + copy_bitmask(view.null_mask(), view.offset(), view.offset() + view.size(), stream, mr); + } 
+ return null_mask; +} + // Inplace Bitwise AND of the masks void inplace_bitmask_and(bitmask_type *dest_mask, std::vector const &masks, std::vector const &begin_bits, size_type mask_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { CUDF_EXPECTS(std::all_of(begin_bits.begin(), begin_bits.end(), [](auto b) { return b >= 0; }), @@ -385,15 +456,13 @@ void inplace_bitmask_and(bitmask_type *dest_mask, CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }), "Mask pointer cannot be null"); - auto num_bytes = bitmask_allocation_size_bytes(mask_size); - auto number_of_mask_words = num_bitmask_words(mask_size); rmm::device_vector d_masks(masks); rmm::device_vector d_begin_bits(begin_bits); cudf::detail::grid_1d config(number_of_mask_words, 256); - offset_bitmask_and<<>>( + offset_bitmask_and<<>>( dest_mask, d_masks.data().get(), d_begin_bits.data().get(), @@ -401,21 +470,19 @@ void inplace_bitmask_and(bitmask_type *dest_mask, mask_size, number_of_mask_words); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); } // Bitwise AND of the masks rmm::device_buffer bitmask_and(std::vector const &masks, std::vector const &begin_bits, size_type mask_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { rmm::device_buffer dest_mask{}; auto num_bytes = bitmask_allocation_size_bytes(mask_size); - auto number_of_mask_words = num_bitmask_words(mask_size); - dest_mask = rmm::device_buffer{num_bytes, stream, mr}; inplace_bitmask_and( static_cast(dest_mask.data()), masks, begin_bits, mask_size, stream, mr); @@ -426,7 +493,7 @@ rmm::device_buffer bitmask_and(std::vector const &masks, cudf::size_type count_set_bits(bitmask_type const *bitmask, size_type start, size_type stop, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { if (nullptr == bitmask) { return 0; } @@ -444,8 +511,9 @@ cudf::size_type count_set_bits(bitmask_type const *bitmask, rmm::device_scalar non_zero_count(0, stream); - count_set_bits_kernel<<>>( - bitmask, start, stop - 1, non_zero_count.data()); + count_set_bits_kernel + <<>>( + bitmask, start, stop - 1, non_zero_count.data()); return non_zero_count.value(); } @@ -453,7 +521,7 @@ cudf::size_type count_set_bits(bitmask_type const *bitmask, cudf::size_type count_unset_bits(bitmask_type const *bitmask, size_type start, size_type stop, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { if (nullptr == bitmask) { return 0; } auto num_bits = (stop - start); @@ -462,7 +530,7 @@ cudf::size_type count_unset_bits(bitmask_type const *bitmask, std::vector segmented_count_set_bits(bitmask_type const *bitmask, std::vector const &indices, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_EXPECTS(indices.size() % 2 == 0, "Array of indices needs to have an even number of elements."); @@ -522,7 +590,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, num_ranges, first_word_indices, last_word_indices, - stream)); + stream.value())); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); // second perform segmented reduction @@ -534,7 +602,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, num_ranges, first_word_indices, last_word_indices, - stream)); + stream.value())); CHECK_CUDA(stream); @@ -548,7 +616,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, subtract_set_bits_range_boundaries_kerenel<<>>( + stream.value()>>>( bitmask, 
num_ranges, d_first_indices.begin(), d_last_indices.begin(), d_null_counts.begin()); CHECK_CUDA(stream); @@ -558,16 +626,16 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, d_null_counts.data().get(), num_ranges * sizeof(size_type), cudaMemcpyDeviceToHost, - stream)); + stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); // now ret is valid. + stream.synchronize(); // now ret is valid. return ret; } std::vector segmented_count_unset_bits(bitmask_type const *bitmask, std::vector const &indices, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (indices.empty()) { return std::vector{}; @@ -587,8 +655,8 @@ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, // Returns the bitwise AND of the null masks of all columns in the table view rmm::device_buffer bitmask_and(table_view const &view, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; @@ -631,7 +699,7 @@ std::vector segmented_count_set_bits(bitmask_type const *bitmask, std::vector const &indices) { CUDF_FUNC_RANGE(); - return detail::segmented_count_set_bits(bitmask, indices, 0); + return detail::segmented_count_set_bits(bitmask, indices, rmm::cuda_stream_default); } // Count zero bits in the specified ranges @@ -639,57 +707,27 @@ std::vector segmented_count_unset_bits(bitmask_type const *bitmask, std::vector const &indices) { CUDF_FUNC_RANGE(); - return detail::segmented_count_unset_bits(bitmask, indices, 0); + return detail::segmented_count_unset_bits(bitmask, indices, rmm::cuda_stream_default); } // Create a bitmask from a specific range rmm::device_buffer copy_bitmask(bitmask_type const *mask, size_type begin_bit, size_type end_bit, - cudaStream_t stream, rmm::mr::device_memory_resource *mr) { - CUDF_FUNC_RANGE(); - CUDF_EXPECTS(begin_bit >= 0, "Invalid range."); - CUDF_EXPECTS(begin_bit <= end_bit, "Invalid bit range."); - rmm::device_buffer dest_mask{}; - auto num_bytes = bitmask_allocation_size_bytes(end_bit - begin_bit); - if ((mask == nullptr) || (num_bytes == 0)) { return dest_mask; } - if (begin_bit == 0) { - dest_mask = rmm::device_buffer{static_cast(mask), num_bytes, stream, mr}; - } else { - auto number_of_mask_words = num_bitmask_words(end_bit - begin_bit); - dest_mask = rmm::device_buffer{num_bytes, stream, mr}; - cudf::detail::grid_1d config(number_of_mask_words, 256); - copy_offset_bitmask<<>>( - static_cast(dest_mask.data()), - mask, - begin_bit, - end_bit, - number_of_mask_words); - CHECK_CUDA(stream); - } - return dest_mask; + return detail::copy_bitmask(mask, begin_bit, end_bit, rmm::cuda_stream_default, mr); } // Create a bitmask from a column view -rmm::device_buffer copy_bitmask(column_view const &view, - cudaStream_t stream, - rmm::mr::device_memory_resource *mr) +rmm::device_buffer copy_bitmask(column_view const &view, rmm::mr::device_memory_resource *mr) { - rmm::device_buffer null_mask{0, stream, mr}; - if (view.nullable()) { - null_mask = - copy_bitmask(view.null_mask(), view.offset(), view.offset() + view.size(), stream, mr); - } - return null_mask; + return detail::copy_bitmask(view, rmm::cuda_stream_default, mr); } -rmm::device_buffer bitmask_and(table_view const &view, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) +rmm::device_buffer bitmask_and(table_view const &view, rmm::mr::device_memory_resource *mr) { - return detail::bitmask_and(view, mr, stream); + return detail::bitmask_and(view, 
rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu index 809abe40989..399bc26f786 100644 --- a/cpp/src/column/column.cu +++ b/cpp/src/column/column.cu @@ -38,6 +38,7 @@ #include #include "cudf/structs/structs_column_view.hpp" #include "cudf/types.hpp" +#include "rmm/cuda_stream_view.hpp" namespace cudf { // Copy constructor @@ -207,12 +208,13 @@ struct create_column_from_view { children.emplace_back(std::make_unique(indices_view, stream, mr)); children.emplace_back(std::make_unique(dict_view.keys(), stream, mr)); } - return std::make_unique(view.type(), - view.size(), - rmm::device_buffer{0, stream, mr}, - cudf::copy_bitmask(view, stream, mr), - view.null_count(), - std::move(children)); + return std::make_unique( + view.type(), + view.size(), + rmm::device_buffer{0, stream, mr}, + cudf::detail::copy_bitmask(view, rmm::cuda_stream_view{stream}, mr), + view.null_count(), + std::move(children)); } template ()> * = nullptr> @@ -231,7 +233,7 @@ struct create_column_from_view { view.size() * cudf::size_of(view.type()), stream, mr}, - cudf::copy_bitmask(view, stream, mr), + cudf::detail::copy_bitmask(view, rmm::cuda_stream_view{stream}, mr), view.null_count(), std::move(children)); } @@ -265,12 +267,13 @@ struct create_column_from_view { auto num_rows = children.empty() ? 0 : children.front()->size(); - return make_structs_column(num_rows, - std::move(children), - view.null_count(), - cudf::copy_bitmask(view.null_mask(), begin, end, stream, mr), - stream, - mr); + return make_structs_column( + num_rows, + std::move(children), + view.null_count(), + cudf::detail::copy_bitmask(view.null_mask(), begin, end, rmm::cuda_stream_view{stream}, mr), + stream, + mr); } }; } // anonymous namespace diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 648e1a14708..efbfd1de501 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -16,10 +16,10 @@ #include #include +#include #include #include #include -#include #include #include #include @@ -36,6 +36,7 @@ struct size_of_helper { constexpr int operator()() const { CUDF_FAIL("Invalid, non fixed-width element type."); + return 0; } template make_numeric_column(data_type type, return std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - create_null_mask(size, state, stream, mr), + detail::create_null_mask(size, state, stream, mr), state_null_count(state, size), std::vector>{}); } @@ -99,7 +100,7 @@ std::unique_ptr make_fixed_point_column(data_type type, return std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - create_null_mask(size, state, stream, mr), + detail::create_null_mask(size, state, stream, mr), state_null_count(state, size), std::vector>{}); } @@ -117,7 +118,7 @@ std::unique_ptr make_timestamp_column(data_type type, return std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - create_null_mask(size, state, stream, mr), + detail::create_null_mask(size, state, stream, mr), state_null_count(state, size), std::vector>{}); } @@ -135,7 +136,7 @@ std::unique_ptr make_duration_column(data_type type, return std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - create_null_mask(size, state, stream, mr), + detail::create_null_mask(size, state, stream, mr), state_null_count(state, size), std::vector>{}); } @@ -182,17 +183,18 @@ std::unique_ptr 
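// A minimal sketch (not taken from the patch) of the wrapper pattern established above:
// the public overloads drop their stream parameter and forward to the cudf::detail
// overloads on rmm::cuda_stream_default, while stream-aware internal callers use the
// detail overloads directly. Function names here are hypothetical.
#include <cudf/column/column_view.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/null_mask.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

// Internal-style call: explicit stream and memory resource.
rmm::device_buffer mask_on_stream(cudf::column_view const& col, rmm::cuda_stream_view stream)
{
  return cudf::detail::copy_bitmask(col, stream, rmm::mr::get_current_device_resource());
}

// Public-style call: no stream parameter, work is issued on rmm::cuda_stream_default.
rmm::device_buffer mask_on_default_stream(cudf::column_view const& col)
{
  return cudf::copy_bitmask(col);
}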
column_from_scalar_dispatch::operator()(value.type(), size, rmm::device_buffer{0, stream, mr}, - create_null_mask(size, mask_state::ALL_NULL, stream, mr), + null_mask, size); // Create a strings column_view with all nulls and no children. // Since we are setting every row to the scalar, the fill() never needs to access // any of the children in the strings column which would otherwise cause an exception. - auto null_mask = create_null_mask(size, mask_state::ALL_NULL, stream); column_view sc{ data_type{type_id::STRING}, size, nullptr, static_cast(null_mask.data()), size}; auto sv = static_cast const&>(value); diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 225e08eb1a8..0ab19f5af1a 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -225,7 +226,7 @@ std::unique_ptr fused_concatenate(std::vector const& views, // Allocate output auto const policy = has_nulls ? mask_policy::ALWAYS : mask_policy::NEVER; - auto out_col = detail::allocate_like(views.front(), output_size, policy, mr, stream); + auto out_col = detail::allocate_like(views.front(), output_size, policy, stream, mr); out_col->set_null_count(0); // prevent null count from being materialized auto out_view = out_col->mutable_view(); auto d_out_view = mutable_column_device_view::create(out_view, stream); @@ -386,7 +387,7 @@ rmm::device_buffer concatenate_masks(std::vector const& views, }); rmm::device_buffer null_mask = - create_null_mask(total_element_count, mask_state::UNINITIALIZED, 0, mr); + create_null_mask(total_element_count, mask_state::UNINITIALIZED, mr); detail::concatenate_masks(views, static_cast(null_mask.data()), 0); diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index 9e7211e9757..6c0aeb601c2 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -18,13 +18,14 @@ #include #include #include +#include #include #include -#include #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -49,8 +50,8 @@ inline mask_state should_allocate_mask(mask_allocation_policy mask_alloc, bool m std::unique_ptr allocate_like(column_view const& input, size_type size, mask_allocation_policy mask_alloc, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(is_fixed_width(input.type()), "Expects only fixed-width type column"); @@ -59,13 +60,13 @@ std::unique_ptr allocate_like(column_view const& input, std::vector> children{}; children.reserve(input.num_children()); for (size_type index = 0; index < input.num_children(); index++) { - children.emplace_back(allocate_like(input.child(index), size, mask_alloc, mr, stream)); + children.emplace_back(allocate_like(input.child(index), size, mask_alloc, stream, mr)); } return std::make_unique(input.type(), size, rmm::device_buffer(size * size_of(input.type()), stream, mr), - create_null_mask(size, allocate_mask, stream, mr), + detail::create_null_mask(size, allocate_mask, stream, mr), state_null_count(allocate_mask, input.size()), std::move(children)); } @@ -107,7 +108,7 @@ std::unique_ptr allocate_like(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::allocate_like(input, input.size(), mask_alloc, mr); + return detail::allocate_like(input, input.size(), mask_alloc, rmm::cuda_stream_default, mr); } std::unique_ptr 
allocate_like(column_view const& input, @@ -116,7 +117,7 @@ std::unique_ptr allocate_like(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::allocate_like(input, size, mask_alloc, mr); + return detail::allocate_like(input, size, mask_alloc, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index f4858714705..619d24c1204 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -21,6 +21,7 @@ #include #include #include "cudf/fixed_point/fixed_point.hpp" +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -36,27 +37,27 @@ struct copy_if_else_functor_impl { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (left_nullable) { if (right_nullable) { auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return detail::copy_if_else(true, lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return detail::copy_if_else(true, lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return detail::copy_if_else(true, lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return detail::copy_if_else(true, lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } if (right_nullable) { auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return detail::copy_if_else(true, lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return detail::copy_if_else(true, lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return detail::copy_if_else(false, lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return detail::copy_if_else(false, lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } }; @@ -71,8 +72,8 @@ struct copy_if_else_functor_impl { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using T = string_view; @@ -81,20 +82,20 @@ struct copy_if_else_functor_impl { auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); return strings::detail::copy_if_else( - lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } if (right_nullable) { auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } auto lhs_iter = cudf::detail::make_pair_iterator(lhs); auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return 
strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, mr, stream); + return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } }; @@ -109,8 +110,8 @@ struct copy_if_else_functor_impl { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("copy_if_else not supported for list_view yet"); } @@ -124,8 +125,8 @@ struct copy_if_else_functor_impl { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("copy_if_else not supported for struct_view yet"); } @@ -142,8 +143,8 @@ struct copy_if_else_functor_impl { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("copy_if_else not supported for decimal32 yet"); } @@ -160,8 +161,8 @@ struct copy_if_else_functor_impl { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("copy_if_else not supported for decimal64 yet"); } @@ -179,11 +180,11 @@ struct copy_if_else_functor { bool left_nullable, bool right_nullable, Filter filter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { copy_if_else_functor_impl copier{}; - return copier(lhs, rhs, size, left_nullable, right_nullable, filter, mr, stream); + return copier(lhs, rhs, size, left_nullable, right_nullable, filter, stream, mr); } }; @@ -194,8 +195,8 @@ std::unique_ptr copy_if_else(Left const& lhs, bool left_nullable, bool right_nullable, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type"); CUDF_EXPECTS(boolean_mask.type() == data_type(type_id::BOOL8), @@ -218,8 +219,8 @@ std::unique_ptr copy_if_else(Left const& lhs, left_nullable, right_nullable, filter, - mr, - stream); + stream, + mr); } else { auto filter = [bool_mask_device] __device__(cudf::size_type i) { return bool_mask_device.element(i); @@ -232,8 +233,8 @@ std::unique_ptr copy_if_else(Left const& lhs, left_nullable, right_nullable, filter, - mr, - stream); + stream, + mr); } } @@ -242,8 +243,8 @@ std::unique_ptr copy_if_else(Left const& lhs, std::unique_ptr copy_if_else(column_view const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs and rhs columns"); @@ -253,15 +254,15 @@ std::unique_ptr copy_if_else(column_view const& lhs, lhs.has_nulls(), rhs.has_nulls(), boolean_mask, - mr, - stream); + stream, + mr); } std::unique_ptr copy_if_else(scalar const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(boolean_mask.size() == 
rhs.size(), "Boolean mask column must be the same size as rhs column"); @@ -270,15 +271,15 @@ std::unique_ptr copy_if_else(scalar const& lhs, !lhs.is_valid(), rhs.has_nulls(), boolean_mask, - mr, - stream); + stream, + mr); } std::unique_ptr copy_if_else(column_view const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs column"); @@ -287,17 +288,17 @@ std::unique_ptr copy_if_else(column_view const& lhs, lhs.has_nulls(), !rhs.is_valid(), boolean_mask, - mr, - stream); + stream, + mr); } std::unique_ptr copy_if_else(scalar const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return copy_if_else(lhs, rhs, !lhs.is_valid(), !rhs.is_valid(), boolean_mask, mr, stream); + return copy_if_else(lhs, rhs, !lhs.is_valid(), !rhs.is_valid(), boolean_mask, stream, mr); } }; // namespace detail @@ -308,7 +309,7 @@ std::unique_ptr copy_if_else(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_if_else(lhs, rhs, boolean_mask, mr); + return detail::copy_if_else(lhs, rhs, boolean_mask, rmm::cuda_stream_default, mr); } std::unique_ptr copy_if_else(scalar const& lhs, @@ -317,7 +318,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_if_else(lhs, rhs, boolean_mask, mr); + return detail::copy_if_else(lhs, rhs, boolean_mask, rmm::cuda_stream_default, mr); } std::unique_ptr copy_if_else(column_view const& lhs, @@ -326,7 +327,7 @@ std::unique_ptr copy_if_else(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_if_else(lhs, rhs, boolean_mask, mr); + return detail::copy_if_else(lhs, rhs, boolean_mask, rmm::cuda_stream_default, mr); } std::unique_ptr copy_if_else(scalar const& lhs, @@ -335,7 +336,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_if_else(lhs, rhs, boolean_mask, mr); + return detail::copy_if_else(lhs, rhs, boolean_mask, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index 812867ba3ca..daca5900768 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -103,7 +104,7 @@ struct out_of_place_copy_range_dispatch { auto p_ret = std::make_unique(target, stream, mr); if ((!p_ret->nullable()) && source.has_nulls(source_begin, source_end)) { p_ret->set_null_mask( - cudf::create_null_mask(p_ret->size(), cudf::mask_state::ALL_VALID, stream, mr), 0); + cudf::detail::create_null_mask(p_ret->size(), cudf::mask_state::ALL_VALID, stream, mr), 0); } if (source_end != source_begin) { // otherwise no-op diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu index e3be4d4cc13..c270be1ccca 100644 --- a/cpp/src/copying/sample.cu +++ b/cpp/src/copying/sample.cu @@ -21,6 +21,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" #include #include @@ -34,8 +35,8 @@ std::unique_ptr
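// A minimal sketch (not taken from the patch) of the calling convention after the
// copy_if_else hunks above: the detail overloads now take their trailing parameters in
// (stream, mr) order rather than (mr, stream), so the memory resource stays last and can
// keep a default argument. The wrapper below is hypothetical.
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/copy.hpp>
#include <rmm/cuda_stream_view.hpp>

std::unique_ptr<cudf::column> select_rows(cudf::column_view const& lhs,
                                          cudf::column_view const& rhs,
                                          cudf::column_view const& boolean_mask,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
{
  // Per-row selection: take lhs[i] where boolean_mask[i] is true, rhs[i] otherwise.
  return cudf::detail::copy_if_else(lhs, rhs, boolean_mask, stream, mr);
}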
sample(table_view const& input, size_type const n, sample_with_replacement replacement, int64_t const seed, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(n >= 0, "expected number of samples should be non-negative"); auto const num_rows = input.num_rows(); @@ -58,13 +59,13 @@ std::unique_ptr
sample(table_view const& input, thrust::make_transform_iterator(thrust::counting_iterator(0), RandomGen); auto end = thrust::make_transform_iterator(thrust::counting_iterator(n), RandomGen); - return detail::gather(input, begin, end, false, mr, stream); + return detail::gather(input, begin, end, false, mr, stream.value()); } else { - auto gather_map = - make_numeric_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED, stream); + auto gather_map = make_numeric_column( + data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED, stream.value()); auto gather_map_mutable_view = gather_map->mutable_view(); // Shuffle all the row indices - thrust::shuffle_copy(rmm::exec_policy(stream)->on(stream), + thrust::shuffle_copy(rmm::exec_policy(stream)->on(stream.value()), thrust::counting_iterator(0), thrust::counting_iterator(num_rows), gather_map_mutable_view.begin(), @@ -77,7 +78,7 @@ std::unique_ptr
sample(table_view const& input,
                           gather_map_view.end<size_type>(),
                           false,
                           mr,
-                          stream);
+                          stream.value());
   }
 }
 
@@ -91,6 +92,6 @@ std::unique_ptr
sample(table_view const& input, { CUDF_FUNC_RANGE(); - return detail::sample(input, n, replacement, seed, mr); + return detail::sample(input, n, replacement, seed, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 3b893ba4f29..b0f0bbef064 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -75,7 +76,7 @@ void scatter_scalar_bitmask(std::vector> const& source, if (target[i]->nullable() or not source_is_valid) { if (not target[i]->nullable()) { // Target must have a null mask if the source is not valid - auto mask = create_null_mask(target[i]->size(), mask_state::ALL_VALID, stream, mr); + auto mask = detail::create_null_mask(target[i]->size(), mask_state::ALL_VALID, stream, mr); target[i]->set_null_mask(std::move(mask), 0); } @@ -349,7 +350,7 @@ std::unique_ptr boolean_mask_scatter(scalar const& input, rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - return detail::copy_if_else(input, target, boolean_mask, mr, stream); + return detail::copy_if_else(input, target, boolean_mask, stream, mr); } std::unique_ptr
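// A minimal sketch (not taken from the patch) of the interop pattern used in the sample()
// hunks above: while a function now receives an rmm::cuda_stream_view, callees that still
// expect a raw cudaStream_t (Thrust execution policies, not-yet-converted detail APIs,
// CUDA runtime calls) are handed stream.value(). Names below are hypothetical.
#include <rmm/cuda_stream_view.hpp>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/system/cuda/execution_policy.h>

void fill_ascending(thrust::device_vector<int>& values, rmm::cuda_stream_view stream)
{
  // thrust::cuda::par.on() wants a cudaStream_t; cuda_stream_view::value() provides it.
  thrust::sequence(thrust::cuda::par.on(stream.value()), values.begin(), values.end());
  stream.synchronize();
}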
boolean_mask_scatter(table_view const& input, diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu index b024c79ab2d..169b6760985 100644 --- a/cpp/src/copying/shift.cu +++ b/cpp/src/copying/shift.cu @@ -26,6 +26,8 @@ #include #include +#include + #include #include #include @@ -53,8 +55,8 @@ struct shift_functor { column_view const& input, size_type offset, scalar const& fill_value, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using Type = device_storage_type_t; using ScalarType = cudf::scalar_type_t; @@ -62,7 +64,7 @@ struct shift_functor { auto device_input = column_device_view::create(input); auto output = - detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, mr, stream); + detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr); auto device_output = mutable_column_device_view::create(*output); auto size = input.size(); @@ -103,7 +105,7 @@ struct shift_functor { }; thrust::transform( - rmm::exec_policy(stream)->on(stream), index_begin, index_end, data, func_value); + rmm::exec_policy(stream)->on(stream.value()), index_begin, index_end, data, func_value); return output; } @@ -111,11 +113,13 @@ struct shift_functor { } // anonymous namespace +namespace detail { + std::unique_ptr shift(column_view const& input, size_type offset, scalar const& fill_value, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(input.type() == fill_value.type(), @@ -123,7 +127,17 @@ std::unique_ptr shift(column_view const& input, if (input.is_empty()) { return empty_like(input); } - return type_dispatcher(input.type(), shift_functor{}, input, offset, fill_value, mr, stream); + return type_dispatcher(input.type(), shift_functor{}, input, offset, fill_value, stream, mr); +} + +} // namespace detail + +std::unique_ptr shift(column_view const& input, + size_type offset, + scalar const& fill_value, + rmm::mr::device_memory_resource* mr) +{ + return detail::shift(input, offset, fill_value, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/copying/slice.cpp b/cpp/src/copying/slice.cpp index f202fd6dfb0..a9141b7a48f 100644 --- a/cpp/src/copying/slice.cpp +++ b/cpp/src/copying/slice.cpp @@ -22,12 +22,13 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { std::vector slice(column_view const& input, std::vector const& indices, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_EXPECTS(indices.size() % 2 == 0, "indices size must be even"); @@ -63,7 +64,7 @@ std::vector slice(cudf::column_view const& input, std::vector const& indices) { CUDF_FUNC_RANGE(); - return detail::slice(input, indices, 0); + return detail::slice(input, indices, rmm::cuda_stream_default); } std::vector slice(cudf::table_view const& input, diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 919d1d4eacc..c3e2cc9a2ff 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -19,11 +19,13 @@ #include #include #include +#include #include #include #include #include #include +#include "rmm/cuda_stream_view.hpp" #include @@ -165,8 +167,13 @@ std::unique_ptr apply_datetime_op(column_view const& column, // Return an empty column if source column is empty if (size == 0) return make_empty_column(output_col_type); - auto output = make_fixed_width_column( - 
output_col_type, size, copy_bitmask(column, stream, mr), column.null_count(), stream, mr); + auto output = + make_fixed_width_column(output_col_type, + size, + cudf::detail::copy_bitmask(column, rmm::cuda_stream_view{stream}, mr), + column.null_count(), + stream, + mr); auto launch = launch_functor::type>{ column, static_cast(*output)}; @@ -260,8 +267,9 @@ std::unique_ptr add_calendrical_months(column_view const& timestamp_colu // Return an empty column if source column is empty if (size == 0) return make_empty_column(output_col_type); - auto output_col_mask = bitmask_and(table_view({timestamp_column, months_column}), mr, stream); - auto output = make_fixed_width_column( + auto output_col_mask = + cudf::detail::bitmask_and(table_view({timestamp_column, months_column}), stream, mr); + auto output = make_fixed_width_column( output_col_type, size, std::move(output_col_mask), cudf::UNKNOWN_NULL_COUNT, stream, mr); auto launch = add_calendrical_months_functor{ diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index dbe22acab27..dc18afebb3b 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include +#include namespace cudf { namespace dictionary { @@ -114,10 +116,11 @@ std::unique_ptr add_keys( // create new dictionary column with keys_column and indices_column // null mask has not changed - return make_dictionary_column(std::move(keys_column), - std::move(indices_column), - copy_bitmask(dictionary_column.parent(), stream, mr), - dictionary_column.null_count()); + return make_dictionary_column( + std::move(keys_column), + std::move(indices_column), + cudf::detail::copy_bitmask(dictionary_column.parent(), rmm::cuda_stream_view{stream}, mr), + dictionary_column.null_count()); } } // namespace detail diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu index deaff20dc9e..c0bde1c92a5 100644 --- a/cpp/src/dictionary/decode.cu +++ b/cpp/src/dictionary/decode.cu @@ -17,12 +17,15 @@ #include #include #include +#include #include #include #include #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -52,7 +55,9 @@ std::unique_ptr decode(dictionary_column_view const& source, auto output_column = std::unique_ptr(std::move(table_column.front())); // apply any nulls to the output column - output_column->set_null_mask(copy_bitmask(source.parent(), stream, mr), source.null_count()); + output_column->set_null_mask( + cudf::detail::copy_bitmask(source.parent(), rmm::cuda_stream_view{stream}, mr), + source.null_count()); return output_column; } diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index dc52820d848..286f4961946 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -16,10 +16,12 @@ #include #include +#include #include #include #include #include +#include namespace cudf { namespace { @@ -57,7 +59,8 @@ std::unique_ptr make_dictionary_column(column_view const& keys_column, type_dispatcher(indices_column.type(), dispatch_create_indices{}, indices_column, mr, stream); rmm::device_buffer null_mask{0, stream, mr}; auto null_count = indices_column.null_count(); - if (null_count) null_mask = copy_bitmask(indices_column, stream, mr); + if (null_count) + null_mask = detail::copy_bitmask(indices_column, rmm::cuda_stream_view{stream}, mr); std::vector> children; 
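// A minimal sketch (not taken from the patch) of the transitional pattern visible in the
// datetime and dictionary hunks above: functions whose own signatures still take a raw
// cudaStream_t wrap it as rmm::cuda_stream_view{stream} when calling the already-converted
// detail APIs. The view is non-owning and trivially cheap to construct at each call site.
// The helper name is hypothetical.
#include <cudf/column/column_view.hpp>
#include <cudf/detail/null_mask.hpp>
#include <rmm/cuda_stream_view.hpp>

rmm::device_buffer copy_mask_from_legacy_signature(cudf::column_view const& col,
                                                   cudaStream_t stream,  // not yet converted
                                                   rmm::mr::device_memory_resource* mr)
{
  return cudf::detail::copy_bitmask(col, rmm::cuda_stream_view{stream}, mr);
}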
children.emplace_back(std::move(indices_copy)); diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index ce3062680e3..613974efde7 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include namespace cudf { namespace dictionary { @@ -61,10 +63,11 @@ std::unique_ptr encode(column_view const& input_column, indices_column = cudf::detail::cast(indices_column->view(), indices_type, mr, stream); // create column with keys_column and indices_column - return make_dictionary_column(std::move(keys_column), - std::move(indices_column), - copy_bitmask(input_column, stream, mr), - input_column.null_count()); + return make_dictionary_column( + std::move(keys_column), + std::move(indices_column), + cudf::detail::copy_bitmask(input_column, rmm::cuda_stream_view{stream}, mr), + input_column.null_count()); } /** diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 4b96b66571a..fa3219ef039 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -117,8 +117,8 @@ std::unique_ptr replace_indices(column_view const& input, input_pair_iterator + input.size(), replacement_iter, predicate, - mr, - stream); + stream, + mr); } } // namespace diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index a9bd95a2876..de6ab9f7261 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -93,7 +94,8 @@ struct out_of_place_fill_range_dispatch { if (end != begin) { // otherwise no fill if (!p_ret->nullable() && !value.is_valid()) { p_ret->set_null_mask( - cudf::create_null_mask(p_ret->size(), cudf::mask_state::ALL_VALID, stream, mr), 0); + cudf::detail::create_null_mask(p_ret->size(), cudf::mask_state::ALL_VALID, stream, mr), + 0); } auto ret_view = p_ret->mutable_view(); @@ -153,7 +155,7 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()(input, stream, mr); auto mview = result->mutable_view(); - cudf::set_null_mask(mview.null_mask(), begin, end, false, stream); + cudf::detail::set_null_mask(mview.null_mask(), begin, end, false, stream); mview.set_null_count(input.null_count() + (end - begin)); return result; } diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 7cace035d93..5bc7e0d02f0 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -278,7 +278,8 @@ void compute_single_pass_aggs(table_view const& keys, bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; if (skip_key_rows_with_nulls) { - auto row_bitmask{bitmask_and(keys, rmm::mr::get_current_device_resource(), stream)}; + auto row_bitmask{ + cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource())}; thrust::for_each_n( rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 5476a2011e7..88bdaf829a1 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -239,7 +239,8 @@ column_view sort_groupby_helper::keys_bitmask_column(cudaStream_t stream) { if (_keys_bitmask_column) return _keys_bitmask_column->view(); - auto row_bitmask = bitmask_and(_keys, rmm::mr::get_current_device_resource(), stream); + auto row_bitmask = + cudf::detail::bitmask_and(_keys, stream, 
rmm::mr::get_current_device_resource()); _keys_bitmask_column = make_numeric_column(data_type(type_id::INT8), _keys.num_rows(), diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 79c95133b91..141c8121dff 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,8 @@ #include #include +#include + namespace cudf { namespace detail { @@ -132,11 +135,11 @@ struct dispatch_to_cudf_column { // If array is sliced, we have to copy whole mask and then take copy. auto out_mask = (num_rows == static_cast(data_buffer->size() / sizeof(T))) ? *tmp_mask - : copy_bitmask(static_cast(tmp_mask->data()), - array.offset(), - array.offset() + num_rows, - stream, - mr); + : cudf::detail::copy_bitmask(static_cast(tmp_mask->data()), + array.offset(), + array.offset() + num_rows, + rmm::cuda_stream_view{stream}, + mr); col->set_null_mask(std::move(out_mask)); } @@ -186,11 +189,11 @@ std::unique_ptr dispatch_to_cudf_column::operator()( auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr; if (has_nulls) { auto out_mask = - copy_bitmask(static_cast(get_mask_buffer(array, mr, stream)->data()), - array.offset(), - array.offset() + array.length(), - stream, - mr); + detail::copy_bitmask(static_cast(get_mask_buffer(array, mr, stream)->data()), + array.offset(), + array.offset() + array.length(), + rmm::cuda_stream_view{stream}, + mr); out_col->set_null_mask(std::move(out_mask)); } @@ -286,11 +289,11 @@ std::unique_ptr dispatch_to_cudf_column::operator()( auto out_mask = *(get_mask_buffer(array, mr, stream)); if (struct_array->null_bitmap_data() != nullptr) { - out_mask = copy_bitmask(static_cast(out_mask.data()), - array.offset(), - array.offset() + array.length(), - stream, - mr); + out_mask = detail::copy_bitmask(static_cast(out_mask.data()), + array.offset(), + array.offset() + array.length(), + rmm::cuda_stream_view{stream}, + mr); } return make_structs_column( diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 1753dff593b..3d6d298de71 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -23,6 +23,7 @@ #include +#include #include #include #include @@ -300,7 +301,7 @@ void reader::impl::decode_data(const rmm::device_buffer &block_data, schema_desc[schema_data_idx].count = dict[i].first; } if (out_buffers[i].null_mask_size()) { - set_null_mask(out_buffers[i].null_mask(), 0, num_rows, true, stream); + cudf::detail::set_null_mask(out_buffers[i].null_mask(), 0, num_rows, true, stream); } } rmm::device_buffer block_list( diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu index 839a013c784..863e7f0a8b3 100644 --- a/cpp/src/io/csv/durations.cu +++ b/cpp/src/io/csv/durations.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -174,7 +175,8 @@ struct dispatch_from_durations_fn { auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(durations, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(durations, rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), duration_to_string_size_fn{d_column}); diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 90bdc42804c..cde8a321f8e 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ 
b/cpp/src/io/utilities/column_buffer.hpp @@ -22,6 +22,7 @@ #pragma once #include +#include #include #include #include @@ -31,6 +32,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace io { @@ -117,7 +119,10 @@ struct column_buffer { default: _data = create_data(type, size, stream, mr); break; } - if (is_nullable) { _null_mask = create_null_mask(size, mask_state::ALL_NULL, stream, mr); } + if (is_nullable) { + _null_mask = cudf::detail::create_null_mask( + size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); + } } auto data() { return _strings.size() ? _strings.data().get() : _data.data(); } diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu index df5495c02e0..c7bf2139a83 100644 --- a/cpp/src/lists/copying/copying.cu +++ b/cpp/src/lists/copying/copying.cu @@ -3,6 +3,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace lists { @@ -54,7 +55,8 @@ std::unique_ptr copy_slice(lists_column_view const& lists, cudf::detail::slice(lists.child(), {start_offset, end_offset}, stream).front()); // Compute the null mask of the result: - auto null_mask = cudf::copy_bitmask(lists.null_mask(), start, end, stream, mr); + auto null_mask = + cudf::detail::copy_bitmask(lists.null_mask(), start, end, rmm::cuda_stream_view{stream}, mr); return make_lists_column(lists_count, std::move(offsets), diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 71ac0865e5e..c22f5afe181 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -262,7 +262,7 @@ struct column_merger { // materialize_merged_bitmask_kernel() // which won't be called anymore (because of the _condition_ below) // - cudf::set_null_mask(merged_view.null_mask(), 0, merged_view.size(), true, stream_); + cudf::detail::set_null_mask(merged_view.null_mask(), 0, merged_view.size(), true, stream_); // set the null count: // diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 09a8b714819..280cc0198cf 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -14,17 +14,20 @@ * limitations under the License. 
*/ -#include -#include +#include #include #include +#include #include #include #include #include #include -#include + +#include +#include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -36,7 +39,7 @@ struct quantile_functor { interpolation interp; bool retain_types; rmm::mr::device_memory_resource* mr; - cudaStream_t stream; + rmm::cuda_stream_view stream; template std::enable_if_t::value, std::unique_ptr> operator()( @@ -51,13 +54,14 @@ struct quantile_functor { { using Result = std::conditional_t; - auto type = data_type{type_to_id()}; - auto output = make_fixed_width_column(type, q.size(), mask_state::UNALLOCATED, stream, mr); + auto type = data_type{type_to_id()}; + auto output = + make_fixed_width_column(type, q.size(), mask_state::UNALLOCATED, stream.value(), mr); if (output->size() == 0) { return output; } if (input.is_empty()) { - auto mask = create_null_mask(output->size(), mask_state::ALL_NULL, stream, mr); + auto mask = cudf::detail::create_null_mask(output->size(), mask_state::ALL_NULL, stream, mr); output->set_null_mask(std::move(mask), output->size()); return output; } diff --git a/cpp/src/reductions/scan.cu b/cpp/src/reductions/scan.cu index 8def1be553c..d5c9527e927 100644 --- a/cpp/src/reductions/scan.cu +++ b/cpp/src/reductions/scan.cu @@ -1,18 +1,17 @@ #include #include #include -#include -#include - -#include -#include -#include - #include #include +#include +#include +#include #include #include #include +#include +#include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -47,9 +46,11 @@ struct ScanDispatcher { { const size_type size = input_view.size(); auto output_column = - detail::allocate_like(input_view, size, mask_allocation_policy::NEVER, mr, stream); + detail::allocate_like(input_view, size, mask_allocation_policy::NEVER, stream, mr); if (null_handling == null_policy::EXCLUDE) { - output_column->set_null_mask(copy_bitmask(input_view, stream, mr), input_view.null_count()); + output_column->set_null_mask( + detail::copy_bitmask(input_view, rmm::cuda_stream_view{stream}, mr), + input_view.null_count()); } mutable_column_view output = output_column->mutable_view(); auto d_input = column_device_view::create(input_view, stream); @@ -91,7 +92,7 @@ struct ScanDispatcher { cudaStream_t stream) { rmm::device_buffer mask = - create_null_mask(input_view.size(), mask_state::UNINITIALIZED, stream, mr); + detail::create_null_mask(input_view.size(), mask_state::UNINITIALIZED, stream, mr); auto d_input = column_device_view::create(input_view, stream); auto v = detail::make_validity_iterator(*d_input); auto first_null_position = @@ -114,9 +115,11 @@ struct ScanDispatcher { { const size_type size = input_view.size(); auto output_column = - detail::allocate_like(input_view, size, mask_allocation_policy::NEVER, mr, stream); + detail::allocate_like(input_view, size, mask_allocation_policy::NEVER, stream, mr); if (null_handling == null_policy::EXCLUDE) { - output_column->set_null_mask(copy_bitmask(input_view, stream, mr), input_view.null_count()); + output_column->set_null_mask( + detail::copy_bitmask(input_view, rmm::cuda_stream_view{stream}, mr), + input_view.null_count()); } else { if (input_view.nullable()) { output_column->set_null_mask(mask_inclusive_scan(input_view, mr, stream), @@ -166,7 +169,9 @@ struct ScanDispatcher { auto output_column = make_strings_column(result, Op::template identity(), stream, mr); if (null_handling == null_policy::EXCLUDE) { - output_column->set_null_mask(copy_bitmask(input_view, stream, mr), 
input_view.null_count()); + output_column->set_null_mask( + detail::copy_bitmask(input_view, rmm::cuda_stream_view{stream}, mr), + input_view.null_count()); } else { if (input_view.nullable()) { output_column->set_null_mask(mask_inclusive_scan(input_view, mr, stream), diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 38af16ed5e2..fff063b269a 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -158,7 +158,7 @@ std::enable_if_t(), std::unique_ptr> clamp cudaStream_t stream) { auto output = - detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, mr, stream); + detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr); // mask will not change if (input.nullable()) { output->set_null_mask(copy_bitmask(input), input.null_count()); } diff --git a/cpp/src/replace/nans.cu b/cpp/src/replace/nans.cu index 4ce992ec9ee..6232da34f06 100644 --- a/cpp/src/replace/nans.cu +++ b/cpp/src/replace/nans.cu @@ -61,8 +61,8 @@ struct replace_nans_functor { input_pair_iterator + size, replacement_pair_iterator, predicate, - mr, - stream); + stream, + mr); } else { auto replacement_pair_iterator = make_pair_iterator(replacement); return copy_if_else(true, @@ -70,8 +70,8 @@ struct replace_nans_functor { input_pair_iterator + size, replacement_pair_iterator, predicate, - mr, - stream); + stream, + mr); } } else { auto input_pair_iterator = make_pair_iterator(*input_device_view); @@ -82,8 +82,8 @@ struct replace_nans_functor { input_pair_iterator + size, replacement_pair_iterator, predicate, - mr, - stream); + stream, + mr); } else { auto replacement_pair_iterator = make_pair_iterator(replacement); return copy_if_else(false, @@ -91,8 +91,8 @@ struct replace_nans_functor { input_pair_iterator + size, replacement_pair_iterator, predicate, - mr, - stream); + stream, + mr); } } } diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index c7bb01d3ecd..2a8fea154e5 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include "cudf/copying.hpp" #include @@ -152,13 +154,14 @@ struct replace_nulls_column_kernel_forwarder { cudf::size_type nrows = input.size(); cudf::detail::grid_1d grid{nrows, BLOCK_SIZE}; - std::unique_ptr output; - if (replacement.has_nulls()) - output = cudf::detail::allocate_like( - input, input.size(), cudf::mask_allocation_policy::ALWAYS, mr, stream); - else - output = cudf::detail::allocate_like( - input, input.size(), cudf::mask_allocation_policy::NEVER, mr, stream); + auto output = + cudf::detail::allocate_like(input, + input.size(), + replacement.has_nulls() ? 
cudf::mask_allocation_policy::ALWAYS + : cudf::mask_allocation_policy::NEVER, + stream, + mr); + auto output_view = output->mutable_view(); auto replace = replace_nulls; @@ -217,7 +220,7 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< auto device_replacement = cudf::column_device_view::create(replacement); rmm::device_buffer valid_bits = - cudf::create_null_mask(input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); + cudf::detail::create_null_mask(input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); // Call first pass kernel to get sizes in offsets cudf::detail::grid_1d grid{input.size(), BLOCK_SIZE, 1}; diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 07a9f0fab9f..21b583cddbe 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -37,13 +37,13 @@ #include #include #include +#include #include #include #include #include #include #include -#include #include #include #include @@ -52,6 +52,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace { // anonymous @@ -317,7 +318,7 @@ struct replace_kernel_forwarder { ? cudf::mask_allocation_policy::ALWAYS : cudf::mask_allocation_policy::NEVER; return cudf::detail::allocate_like( - input_col, input_col.size(), mask_allocation_policy, mr, stream); + input_col, input_col.size(), mask_allocation_policy, stream, mr); }(); auto output_view = output->mutable_view(); @@ -395,8 +396,8 @@ std::unique_ptr replace_kernel_forwarder::operator() #include +#include #include #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -68,7 +70,8 @@ struct byte_list_conversion { auto offsets_column = cudf::strings::detail::make_offsets_child_column( begin, begin + input_column.size(), mr, stream); - rmm::device_buffer null_mask = copy_bitmask(input_column, stream, mr); + rmm::device_buffer null_mask = + detail::copy_bitmask(input_column, rmm::cuda_stream_view{stream}, mr); return make_lists_column(input_column.size(), std::move(offsets_column), @@ -97,7 +100,7 @@ std::unique_ptr byte_list_conversion::operator()( std::move(contents.children[cudf::strings_column_view::offsets_column_index]), std::move(contents.children[cudf::strings_column_view::chars_column_index]), input_column.null_count(), - copy_bitmask(input_column, stream, mr), + detail::copy_bitmask(input_column, rmm::cuda_stream_view{stream}, mr), stream, mr); } diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index d6c68d56797..ef2ef8858ea 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -127,7 +127,7 @@ struct interleave_columns_functor { auto arch_column = input.column(0); auto output_size = input.num_columns() * input.num_rows(); auto output = - allocate_like(arch_column, output_size, mask_allocation_policy::NEVER, mr, stream); + allocate_like(arch_column, output_size, mask_allocation_policy::NEVER, stream, mr); auto device_input = table_device_view::create(input); auto device_output = mutable_column_device_view::create(*output); auto index_begin = thrust::make_counting_iterator(0); diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index b228136ad1b..89d3534a41f 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -19,14 +19,16 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { -std::string string_scalar::to_string(cudaStream_t stream) const +std::string string_scalar::to_string(rmm::cuda_stream_view stream) const { 
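// A minimal sketch (not taken from the patch) of calling string_scalar::to_string() after
// the signature change above: the method now accepts an rmm::cuda_stream_view (presumably
// defaulted to rmm::cuda_stream_default in the header) and synchronizes via
// stream.synchronize() before returning, so the std::string is valid on return.
// Names below are hypothetical.
#include <cudf/scalar/scalar.hpp>
#include <rmm/cuda_stream.hpp>
#include <string>

std::string host_value(cudf::string_scalar const& s)
{
  rmm::cuda_stream stream;            // worker stream owned by the caller
  return s.to_string(stream.view());  // device-to-host copy is issued and synchronized here
}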
std::string result; result.resize(_data.size()); - CUDA_TRY(cudaMemcpyAsync(&result[0], _data.data(), _data.size(), cudaMemcpyDeviceToHost, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync( + &result[0], _data.data(), _data.size(), cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); return result; } diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index 652a90809a3..a3a16130dfb 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,8 @@ #include #include +#include + #include #include #include @@ -242,8 +245,12 @@ std::unique_ptr rank(column_view const &input, std::unique_ptr rank_column = [&null_handling, &output_type, &input, &mr, &stream] { // na_option=keep assign NA to NA values if (null_handling == null_policy::EXCLUDE) - return make_numeric_column( - output_type, input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + return make_numeric_column(output_type, + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); else return make_numeric_column(output_type, input.size(), mask_state::UNALLOCATED, stream, mr); }(); diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 1bad00f49a4..e16291b6aa2 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,8 @@ #include #include +#include + #include #include @@ -59,7 +62,8 @@ std::unique_ptr counts_fn(strings_column_view const& strings, cudf::data_type{type_id::INT32}, strings_count, rmm::device_buffer(strings_count * sizeof(int32_t), stream, mr), - copy_bitmask(strings.parent(), stream, mr), // copy the null mask + cudf::detail::copy_bitmask( + strings.parent(), rmm::cuda_stream_view{stream}, mr), // copy the null mask strings.null_count()); auto results_view = results->mutable_view(); auto d_lengths = results_view.data(); diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index b3bf684f33a..48306ce4e11 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -19,15 +19,19 @@ #include #include #include +#include #include #include #include #include #include #include + #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -142,7 +146,8 @@ std::unique_ptr convert_case(strings_column_view const& strings, size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // get the lookup tables used for case conversion auto d_flags = get_character_flags_table(); diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index d3ac5229dd4..6e63e756c2e 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -26,9 +27,10 @@ #include #include +#include + #include -// namespace cudf { namespace strings { namespace detail { @@ -45,12 +47,13 @@ std::unique_ptr all_characters_of_type( auto d_column = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - 
stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // get the static character types table @@ -168,7 +171,8 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str d_replacement}; // copy null mask from input column - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // this utility calls filterer to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( @@ -192,12 +196,13 @@ std::unique_ptr is_integer( auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); thrust::transform(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), @@ -234,12 +239,13 @@ std::unique_ptr is_float( auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // check strings for valid float chars thrust::transform(rmm::exec_policy(stream)->on(stream), diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index 60f7e13d866..57bd7abef2f 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -26,12 +27,15 @@ #include #include #include + #include #include + #include #include #include + #include namespace cudf { @@ -210,7 +214,7 @@ std::unique_ptr join_strings(strings_column_view const& strings, size_type null_count = 0; rmm::device_buffer null_mask{0, stream, mr}; // init to null null-mask if (strings.null_count() == strings_count && !narep.is_valid()) { - null_mask = create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr); + null_mask = cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr); null_count = 1; } auto chars_column = diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 6b441b29c47..96c87f554b5 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -17,8 +17,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -26,6 +26,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -79,12 +81,13 @@ std::unique_ptr contains_util( auto d_prog = *prog; // create the output column - auto results = 
make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // fill the output column @@ -200,12 +203,13 @@ std::unique_ptr count_re( auto d_prog = *prog; // create the output column - auto results = make_numeric_column(data_type{type_id::INT32}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::INT32}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // fill the output column diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index 18fdf68aa23..1ba2151c0a7 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -27,6 +28,8 @@ #include #include +#include + #include #include @@ -49,12 +52,13 @@ std::unique_ptr to_booleans(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create output column copying the strings' null-mask - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); @@ -106,7 +110,8 @@ std::unique_ptr from_booleans(column_view const& booleans, auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(booleans, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(booleans, rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index d44e8a7ec13..f716b1500c6 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -28,9 +29,12 @@ #include #include +#include +#include + #include + #include -#include #include namespace cudf { @@ -414,12 +418,13 @@ std::unique_ptr to_timestamps(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; - auto results = make_timestamp_column(timestamp_type, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_timestamp_column( + timestamp_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); cudf::type_dispatcher( 
timestamp_type, dispatch_to_timestamps_fn(), d_column, format, units, results_view, stream); @@ -564,12 +569,13 @@ std::unique_ptr is_timestamp(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); format_compiler compiler(format.c_str(), stream); @@ -886,7 +892,8 @@ std::unique_ptr from_timestamps(column_view const& timestamps, auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(timestamps, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(timestamps, rmm::cuda_stream_view{stream}, mr); // Each string will be the same number of bytes which can be determined // directly from the format string. auto d_str_bytes = compiler.template_bytes(); // size in bytes of each string diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index ba444c4ebe0..d2709e2ebe1 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -15,14 +15,18 @@ */ #include #include +#include #include #include #include #include +#include +#include + #include + #include -#include #include namespace cudf { @@ -409,7 +413,8 @@ struct dispatch_from_durations_fn { auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(durations, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(durations, rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -723,12 +728,13 @@ std::unique_ptr to_durations(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; - auto results = make_duration_column(duration_type, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_duration_column( + duration_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); cudf::type_dispatcher( duration_type, dispatch_to_durations_fn(), d_column, format, results_view, stream); diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 4ba347dbd50..8abf49c5dca 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -26,10 +27,12 @@ #include #include -#include #include +#include + #include #include + #include #include @@ -175,12 +178,13 @@ std::unique_ptr to_floats(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create float output column copying the strings null-mask - auto results = make_numeric_column(output_type, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - 
strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + output_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); // fill output column with floats type_dispatcher(output_type, dispatch_to_floats_fn{}, d_strings, results_view, stream); @@ -467,7 +471,8 @@ struct dispatch_from_floats_fn { auto d_column = *column; // copy the null mask - rmm::device_buffer null_mask = copy_bitmask(floats, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(floats, rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), float_to_string_size_fn{d_column}); diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index 60fe3a80d79..a8ea7cf3ab9 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -26,6 +27,8 @@ #include #include +#include + #include #include #include @@ -129,12 +132,13 @@ std::unique_ptr hex_to_integers( auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create integer output column copying the strings null-mask - auto results = make_numeric_column(output_type, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + output_type, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); // fill output column with integers type_dispatcher(output_type, dispatch_hex_to_integers_fn{}, d_strings, results_view, stream); @@ -149,12 +153,13 @@ std::unique_ptr is_hex(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); thrust::transform(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 248f2f9a717..42bd70899a9 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,8 @@ #include #include +#include + #include #include @@ -101,12 +104,13 @@ std::unique_ptr to_integers(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create integer output column copying the strings null-mask - auto results = make_numeric_column(output_type, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + output_type, + strings_count, + 
cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); // fill output column with integers type_dispatcher(output_type, dispatch_to_integers_fn{}, d_strings, results_view, stream); @@ -180,7 +184,8 @@ struct dispatch_from_integers_fn { auto d_column = *column; // copy the null mask - rmm::device_buffer null_mask = copy_bitmask(integers, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(integers, rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), integer_to_string_size_fn{d_column}); diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index 3a18480c866..dcccad30f30 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,8 @@ #include #include +#include + #include #include @@ -78,12 +81,13 @@ std::unique_ptr ipv4_to_integers( auto strings_column = column_device_view::create(strings.parent(), stream); // create output column copying the strings' null-mask - auto results = make_numeric_column(data_type{type_id::INT64}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::INT64}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); // fill output column with ipv4 integers thrust::transform(rmm::exec_policy(stream)->on(stream), @@ -168,7 +172,8 @@ std::unique_ptr integers_to_ipv4( auto d_column = *column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(integers, stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(integers, rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { @@ -212,12 +217,13 @@ std::unique_ptr is_ipv4(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto d_results = results->mutable_view().data(); thrust::transform(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 5030e49a23a..9b5c142511f 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -120,7 +123,8 @@ std::unique_ptr url_encode( auto d_strings = *strings_column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + 
rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), url_encoder_fn{d_strings}); @@ -222,7 +226,8 @@ std::unique_ptr url_decode( auto d_strings = *strings_column; // copy null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), url_decoder_fn{d_strings}); diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 38f43a8bb5d..9a8a64f2f99 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -249,7 +250,8 @@ std::unique_ptr concatenate(std::vector const& columns, rmm::device_buffer null_mask{0, stream, mr}; size_type null_count{}; if (has_nulls) { - null_mask = create_null_mask(strings_count, mask_state::UNINITIALIZED, stream, mr); + null_mask = + cudf::detail::create_null_mask(strings_count, mask_state::UNINITIALIZED, stream, mr); } { // Copy offsets columns with single kernel launch diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 3db0017f55f..975d84c7875 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -25,7 +26,10 @@ #include #include +#include + #include + #include namespace cudf { @@ -123,7 +127,8 @@ std::unique_ptr filter_characters( auto d_strings = *strings_column; // create null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // create offsets column filter_fn ffn{d_strings, keep_characters, table.begin(), table.end(), d_replacement}; diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index 7e401146d9f..1b3ede7c88c 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -24,6 +25,8 @@ #include #include +#include + #include namespace cudf { @@ -65,12 +68,13 @@ std::unique_ptr find_fn(strings_column_view const& strings, auto d_strings = *strings_column; auto strings_count = strings.size(); // create output column - auto results = make_numeric_column(data_type{type_id::INT32}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::INT32}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // set the position values by evaluating the passed function @@ -187,7 +191,9 @@ std::unique_ptr contains_fn(strings_column_view const& strings, { auto const true_scalar = make_fixed_width_scalar(true, stream); auto results = make_column_from_scalar(*true_scalar, strings.size(), mr, stream); - results->set_null_mask(copy_bitmask(strings.parent(), stream, mr), strings.null_count()); + results->set_null_mask( + 
cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count()); return results; } @@ -195,12 +201,13 @@ std::unique_ptr contains_fn(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // set the bool values by evaluating the passed function @@ -250,12 +257,13 @@ std::unique_ptr contains_fn(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings_count, - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column( + data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // set the bool values by evaluating the passed function diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/findall.cu index 5c0904c2cb8..d7e695c0a3a 100644 --- a/cpp/src/strings/findall.cu +++ b/cpp/src/strings/findall.cu @@ -17,14 +17,15 @@ #include #include #include +#include #include -#include #include #include #include #include #include #include +#include "rmm/cuda_stream_view.hpp" #include @@ -112,15 +113,15 @@ std::unique_ptr
findall_re( strings_column_view const& strings, std::string const& pattern, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { auto strings_count = strings.size(); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(strings.parent(), stream.value()); auto d_strings = *strings_column; auto d_flags = detail::get_character_flags_table(); // compile regex into device object - auto prog = reprog_device::create(pattern, d_flags, strings_count, stream); + auto prog = reprog_device::create(pattern, d_flags, strings_count, stream.value()); auto d_prog = *prog; auto execpol = rmm::exec_policy(stream); int regex_insts = prog->insts_counts(); @@ -129,19 +130,19 @@ std::unique_ptr
findall_re( auto d_find_counts = find_counts.data().get(); if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_find_counts, findall_count_fn{d_strings, d_prog}); else if (regex_insts <= RX_MEDIUM_INSTS) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_find_counts, findall_count_fn{d_strings, d_prog}); else - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_find_counts, @@ -150,41 +151,41 @@ std::unique_ptr
findall_re( std::vector> results; size_type columns = - *thrust::max_element(execpol->on(stream), find_counts.begin(), find_counts.end()); + *thrust::max_element(execpol->on(stream.value()), find_counts.begin(), find_counts.end()); // boundary case: if no columns, return all nulls column (issue #119) if (columns == 0) - results.emplace_back( - std::make_unique(data_type{type_id::STRING}, - strings_count, - rmm::device_buffer{0, stream, mr}, // no data - create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), - strings_count)); + results.emplace_back(std::make_unique( + data_type{type_id::STRING}, + strings_count, + rmm::device_buffer{0, stream, mr}, // no data + cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), + strings_count)); for (int32_t column_index = 0; column_index < columns; ++column_index) { rmm::device_vector indices(strings_count); string_index_pair* d_indices = indices.data().get(); if ((regex_insts > MAX_STACK_INSTS) || (regex_insts <= RX_SMALL_INSTS)) - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_indices, findall_fn{d_strings, d_prog, column_index, d_find_counts}); else if (regex_insts <= RX_MEDIUM_INSTS) thrust::transform( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_indices, findall_fn{d_strings, d_prog, column_index, d_find_counts}); else - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), d_indices, findall_fn{d_strings, d_prog, column_index, d_find_counts}); // - results.emplace_back(make_strings_column(indices, stream, mr)); + results.emplace_back(make_strings_column(indices, stream.value(), mr)); } return std::make_unique
(std::move(results)); } diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 8e9951a7bf8..05b5293e432 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -67,7 +70,8 @@ std::unique_ptr pad( auto d_strings = *strings_column; // create null_mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = @@ -155,7 +159,8 @@ std::unique_ptr zfill( auto d_strings = *strings_column; // copy bitmask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index e94d01f0268..f373c97b1ef 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -108,7 +111,8 @@ std::unique_ptr replace(strings_column_view const& strings, auto d_strings = *strings_column; // copy the null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -194,7 +198,8 @@ std::unique_ptr replace_slice(strings_column_view const& strings, auto d_strings = *strings_column; // copy the null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -303,7 +308,8 @@ std::unique_ptr replace(strings_column_view const& strings, auto d_repls = *repls_column; // copy the null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index a8fcd4adbd3..4ef46b289e2 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -493,12 +494,12 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, *thrust::max_element(execpol->on(stream), token_counts.begin(), token_counts.end()); // boundary case: if no columns, return one null column (custrings issue #119) if (columns_count == 0) { - results.push_back( - std::make_unique(data_type{type_id::STRING}, - strings_count, - rmm::device_buffer{0, stream, mr}, // no data - create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), - strings_count)); + results.push_back(std::make_unique( + data_type{type_id::STRING}, + strings_count, + rmm::device_buffer{0, stream, mr}, // no data + cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), + strings_count)); } // create working area to hold all token positions @@ -764,12 +765,12 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, std::vector> results; // boundary case: if no columns, return one null column (issue #119) if (columns_count == 0) { - results.push_back( - std::make_unique(data_type{type_id::STRING}, - strings_count, - rmm::device_buffer{0, stream, mr}, // no data - create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), - strings_count)); + results.push_back(std::make_unique( + data_type{type_id::STRING}, + strings_count, + rmm::device_buffer{0, stream, mr}, // no data + cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), + strings_count)); } // get the positions for every token diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index 18df9d5e48c..ea5a2d8ef69 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -24,6 +25,8 @@ #include #include +#include + #include #include @@ -118,7 +121,8 @@ std::unique_ptr strip( size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column -- calculate the size of each output string auto offsets_transformer_itr = thrust::make_transform_iterator( diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index d5695bddb31..1d4656ffa8f 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -111,7 +112,8 @@ std::unique_ptr slice_strings( auto d_step = get_scalar_device_view(const_cast&>(step)); // copy the null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index e61a40d655f..1fc9ff7f813 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include + #include namespace cudf { @@ -92,7 +94,8 @@ std::unique_ptr translate( auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // create offsets column auto offsets_transformer_itr = thrust::make_transform_iterator(thrust::make_counting_iterator(0), diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index 5864cc1f2c7..181283c5e34 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -103,7 +104,8 @@ std::unique_ptr wrap( size_type null_count = strings.null_count(); // copy null mask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // build offsets column auto offsets_column = std::make_unique(strings.offsets(), stream, mr); // makes a copy diff --git 
a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index cc006760519..ac67b08eba0 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -19,18 +19,24 @@ #include #include #include +#include #include #include #include #include + #include #include + #include #include +#include + #include #include + #include namespace nvtext { @@ -161,7 +167,8 @@ std::unique_ptr normalize_spaces( auto strings_column = cudf::column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // copy bitmask - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // create offsets by calculating size of each string for output auto offsets_transformer_itr = @@ -247,13 +254,14 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con codepoint_to_utf8_fn{*strings_column, cp_chars, cp_offsets, d_offsets, d_chars}); chars_column->set_null_count(0); // reset null count for child column - return cudf::make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), - strings.null_count(), - copy_bitmask(strings.parent(), stream, mr), - stream, - mr); + return cudf::make_strings_column( + strings_count, + std::move(offsets_column), + std::move(chars_column), + strings.null_count(), + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + stream, + mr); } } // namespace detail diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 7733e521b04..4263c5f1864 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -214,7 +215,8 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st *replacements_column}; // copy null mask from input column - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // this utility calls replacer to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( @@ -249,7 +251,8 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str remove_small_tokens_fn filterer{*strings_column, d_delimiter, min_token_length, d_replacement}; // copy null mask from input column - rmm::device_buffer null_mask = copy_bitmask(strings.parent(), stream, mr); + rmm::device_buffer null_mask = + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); // this utility calls filterer to build the offsets and chars columns auto children = cudf::strings::detail::make_strings_children( diff --git a/cpp/src/text/stemmer.cu b/cpp/src/text/stemmer.cu index ec4ad17448b..1521dc90dae 100644 --- a/cpp/src/text/stemmer.cu +++ b/cpp/src/text/stemmer.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -97,12 +98,13 @@ std::unique_ptr is_letter(cudf::strings_column_view const& strings if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::BOOL8}); // create empty output column - auto results = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, - strings.size(), - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::BOOL8}, + strings.size(), + 
cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); // set values into output column auto strings_column = cudf::column_device_view::create(strings.parent(), stream); thrust::transform(rmm::exec_policy(stream)->on(stream), @@ -204,12 +206,13 @@ std::unique_ptr porter_stemmer_measure(cudf::strings_column_view c if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); // create empty output column - auto results = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, - strings.size(), - copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); + auto results = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT32}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), + strings.null_count(), + stream, + mr); // compute measures into output column auto strings_column = cudf::column_device_view::create(strings.parent(), stream); thrust::transform(rmm::exec_policy(stream)->on(stream), diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 7729d17aadc..c8e7bd2fd5e 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -53,7 +53,7 @@ std::pair, std::unique_ptr> encode( auto num_rows = keys_table->num_rows(); auto mask = - cudf::detail::bitmask_and(keys_table->view(), rmm::mr::get_current_device_resource(), stream); + cudf::detail::bitmask_and(keys_table->view(), stream, rmm::mr::get_current_device_resource()); auto num_rows_with_nulls = cudf::count_unset_bits(reinterpret_cast(mask.data()), 0, num_rows); diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 338bb481606..e96f6e4f004 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -22,6 +23,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -110,12 +112,13 @@ struct dispatch_unary_cast_to { rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - auto size = input.size(); - auto output = std::make_unique(type, - size, - rmm::device_buffer{size * cudf::size_of(type), 0, mr}, - copy_bitmask(input, 0, mr), - input.null_count()); + auto size = input.size(); + auto output = + std::make_unique(type, + size, + rmm::device_buffer{size * cudf::size_of(type), stream, mr}, + detail::copy_bitmask(input, stream, mr), + input.null_count()); mutable_column_view output_mutable = *output; diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index 1b4f91ad10f..08b653c7353 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -16,12 +16,15 @@ #include #include +#include #include #include #include #include #include +#include + #include #include @@ -261,12 +264,13 @@ std::unique_ptr transform_fn(cudf::dictionary_column_view const& i auto dictionary_itr = dictionary::detail::make_dictionary_iterator(*dictionary_view); auto default_mr = rmm::mr::get_current_device_resource(); // call unary-op using temporary output buffer - auto output = transform_fn(dictionary_itr, - dictionary_itr + input.size(), - copy_bitmask(input.parent(), stream, default_mr), - input.null_count(), - default_mr, - stream); + auto output = transform_fn( + dictionary_itr, + dictionary_itr + input.size(), + detail::copy_bitmask(input.parent(), rmm::cuda_stream_view{stream}, default_mr), + input.null_count(), + default_mr, + stream); return cudf::dictionary::detail::encode( 
output->view(), dictionary::detail::get_indices_type_for_size(output->size()), mr, stream); } @@ -278,12 +282,13 @@ struct MathOpDispatcher { rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - return transform_fn(input.begin(), - input.end(), - copy_bitmask(input, stream, mr), - input.null_count(), - mr, - stream); + return transform_fn( + input.begin(), + input.end(), + cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), + input.null_count(), + mr, + stream); } struct dictionary_dispatch { @@ -335,12 +340,13 @@ struct BitwiseOpDispatcher { rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - return transform_fn(input.begin(), - input.end(), - copy_bitmask(input, stream, mr), - input.null_count(), - mr, - stream); + return transform_fn( + input.begin(), + input.end(), + cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), + input.null_count(), + mr, + stream); } struct dictionary_dispatch { @@ -400,12 +406,13 @@ struct LogicalOpDispatcher { rmm::mr::device_memory_resource* mr, cudaStream_t stream) { - return transform_fn(input.begin(), - input.end(), - copy_bitmask(input, stream, mr), - input.null_count(), - mr, - stream); + return transform_fn( + input.begin(), + input.end(), + cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), + input.null_count(), + mr, + stream); } struct dictionary_dispatch { @@ -416,12 +423,13 @@ struct LogicalOpDispatcher { { auto dictionary_view = cudf::column_device_view::create(input.parent(), stream); auto dictionary_itr = dictionary::detail::make_dictionary_iterator(*dictionary_view); - return transform_fn(dictionary_itr, - dictionary_itr + input.size(), - copy_bitmask(input.parent(), stream, mr), - input.null_count(), - mr, - stream); + return transform_fn( + dictionary_itr, + dictionary_itr + input.size(), + cudf::detail::copy_bitmask(input.parent(), rmm::cuda_stream_view{stream}, mr), + input.null_count(), + mr, + stream); } template ()>* = nullptr> From 7802d5246694b27c64efc61e68d0043cb882ad55 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 2 Nov 2020 17:04:06 +1100 Subject: [PATCH 03/51] Revert commented out stuff. 
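
Restores the host- and device-side dispatch calls in
type_dispatcher_benchmark.cu that were left commented out in the previous
commit, so the benchmark exercises cudf::type_dispatcher again. For context,
below is a minimal sketch of the host-side dispatch pattern the benchmark
measures; the is_int32_fn functor and is_int32 helper here are illustrative
stand-ins only, not code from the benchmark:

    #include <cudf/types.hpp>
    #include <cudf/utilities/type_dispatcher.hpp>

    #include <cstdint>
    #include <type_traits>

    // Functor with a templated call operator. cudf::type_dispatcher maps a
    // runtime cudf::data_type to the corresponding compile-time type T and
    // invokes operator()<T>().
    struct is_int32_fn {
      template <typename T>
      bool operator()() const
      {
        return std::is_same<T, int32_t>::value;
      }
    };

    // Example usage: returns true only for data_type{type_id::INT32}.
    bool is_int32(cudf::data_type t)
    {
      return cudf::type_dispatcher(t, is_int32_fn{});
    }

Note that type_dispatcher instantiates the functor's templated operator() for
every dispatched type_id, so the body must compile for all of them; this is
why the benchmark keeps its per-type work inside a single templated call
operator.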
--- .../type_dispatcher/type_dispatcher_benchmark.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu index 222a2c40618..56b6ead120e 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu @@ -144,14 +144,14 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread) // std::vector v_stream(n_cols); for (int c = 0; c < n_cols; c++) { auto d_column = mutable_column_device_view::create(input.column(c)); - // cudf::type_dispatcher( - // d_column->type(), ColumnHandle{}, *d_column, work_per_thread); + cudf::type_dispatcher( + d_column->type(), ColumnHandle{}, *d_column, work_per_thread); } } else if (dispatching_type == DEVICE_DISPATCHING) { auto d_table_view = mutable_table_device_view::create(input); - // auto f = device_dispatching_kernel; + auto f = device_dispatching_kernel; // Launch the kernel - // f<<>>(*d_table_view); + f<<>>(*d_table_view); } else if (dispatching_type == NO_DISPATCHING) { auto f = no_dispatching_kernel; // Launch the kernel From f0ca10c0453b91eba0ff739ec433b13eae6f85d0 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 2 Nov 2020 17:23:52 +1100 Subject: [PATCH 04/51] Convert AST to cuda_stream_view --- cpp/include/cudf/ast/detail/transform.cuh | 3 ++- cpp/src/ast/transform.cu | 16 +++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index 3366acefe35..454085ff9bd 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -27,6 +27,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { @@ -369,7 +370,7 @@ struct ast_plan { std::unique_ptr compute_column( table_view const table, expression const& expr, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index b8906a36121..ffc80a926fb 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -29,6 +29,8 @@ #include #include #include + +#include #include #include @@ -87,7 +89,7 @@ __launch_bounds__(max_block_size) __global__ std::unique_ptr compute_column(table_view const table, expression const& expr, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // Linearize the AST @@ -126,14 +128,14 @@ std::unique_ptr compute_column(table_view const table, reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]); // Create table device view - auto table_device = table_device_view::create(table, stream); + auto table_device = table_device_view::create(table, stream.value()); auto const table_num_rows = table.num_rows(); // Prepare output column auto output_column = cudf::make_fixed_width_column( - expr_data_type, table_num_rows, mask_state::UNALLOCATED, stream, mr); + expr_data_type, table_num_rows, mask_state::UNALLOCATED, stream.value(), mr); auto mutable_output_device = - cudf::mutable_column_device_view::create(output_column->mutable_view(), stream); + cudf::mutable_column_device_view::create(output_column->mutable_view(), stream.value()); // Configure kernel parameters auto const num_intermediates = expr_linearizer.get_intermediate_count(); @@ -153,7 +155,7 @@ 
std::unique_ptr compute_column(table_view const table, // Execute the kernel cudf::ast::detail::compute_column_kernel - <<>>( + <<>>( *table_device, device_literals, *mutable_output_device, @@ -162,7 +164,7 @@ std::unique_ptr compute_column(table_view const table, device_operator_source_indices, num_operators, num_intermediates); - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return output_column; } @@ -173,7 +175,7 @@ std::unique_ptr compute_column(table_view const table, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::compute_column(table, expr, 0, mr); + return detail::compute_column(table, expr, rmm::cuda_stream_default, mr); } } // namespace ast From 22a14ddb83b2350e58c9d64f3df0c076d08bd975 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 2 Nov 2020 17:52:23 +1100 Subject: [PATCH 05/51] Convert column_device_view to rmm::cuda_stream_view --- cpp/include/cudf/column/column_device_view.cuh | 6 ++++-- cpp/src/column/column_device_view.cu | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 5446d9b2f29..5118db2364e 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -28,6 +28,8 @@ #include #include +#include + /** * @file column_device_view.cuh * @brief Column device view class definitons @@ -386,7 +388,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { *`source_view` available in device memory. */ static std::unique_ptr> create( - column_view source_view, cudaStream_t stream = 0); + column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Destroy the `column_device_view` object. 
@@ -480,7 +482,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view */ static std::unique_ptr> - create(mutable_column_view source_view, cudaStream_t stream = 0); + create(mutable_column_view source_view, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Returns pointer to the base device memory allocation casted to diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 8e61f776e39..fb3bab68446 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -18,6 +18,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" #include @@ -92,7 +93,7 @@ ColumnDeviceView* child_columns_to_device_array(ColumnView const& source, void* // helper function for column_device_view::create and mutable_column_device::create methods template std::unique_ptr> -create_device_view_from_view(ColumnView const& source, cudaStream_t stream) +create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view stream) { size_type num_children = source.num_children(); // First calculate the size of memory needed to hold the @@ -129,9 +130,9 @@ create_device_view_from_view(ColumnView const& source, cudaStream_t stream) staging_buffer.data(), descendant_storage->size(), cudaMemcpyDefault, - stream)); + stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaStreamSynchronize(stream.value())); return result; } @@ -153,7 +154,7 @@ column_device_view::column_device_view(column_view source, void* h_ptr, void* d_ // Construct a unique_ptr that invokes `destroy()` as it's deleter std::unique_ptr> -column_device_view::create(column_view source, cudaStream_t stream) +column_device_view::create(column_view source, rmm::cuda_stream_view stream) { size_type num_children = source.num_children(); if (num_children == 0) { @@ -203,7 +204,7 @@ void mutable_column_device_view::destroy() { delete this; } // Construct a unique_ptr that invokes `destroy()` as it's deleter std::unique_ptr> -mutable_column_device_view::create(mutable_column_view source, cudaStream_t stream) +mutable_column_device_view::create(mutable_column_view source, rmm::cuda_stream_view stream) { return source.num_children() == 0 ? std::unique_ptr(new mutable_column_device_view(source)) From 2fab2ad9a2d305afc955dfa83ac33ca83bdca449 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 2 Nov 2020 18:39:15 +1100 Subject: [PATCH 06/51] Convert column to rmm::cuda_stream_view --- cpp/include/cudf/column/column.hpp | 11 ++++----- cpp/src/column/column.cu | 37 +++++++++++++++--------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index ce0ed412b27..b94a2f13e1d 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -15,10 +15,12 @@ */ #pragma once +#include "column_view.hpp" + #include #include -#include "column_view.hpp" +#include #include #include @@ -50,9 +52,6 @@ class column { /** * @brief Construct a new column by deep copying the contents of `other`. * - * All device memory allocation and copying is done using the - * `device_memory_resource` and `stream` from `other`. 
- * * @param other The column to copy **/ column(column const& other); @@ -69,7 +68,7 @@ class column { * @param mr Device memory resource to use for all device memory allocations */ column(column const& other, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -124,7 +123,7 @@ class column { * @param mr Device memory resource to use for all device memory allocations */ explicit column(column_view view, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu index 399bc26f786..b64f88291b7 100644 --- a/cpp/src/column/column.cu +++ b/cpp/src/column/column.cu @@ -26,19 +26,19 @@ #include #include #include +#include +#include #include #include #include -#include +#include #include #include +#include #include #include -#include "cudf/structs/structs_column_view.hpp" -#include "cudf/types.hpp" -#include "rmm/cuda_stream_view.hpp" namespace cudf { // Copy constructor @@ -54,7 +54,9 @@ column::column(column const &other) } // Copy ctor w/ explicit stream/mr -column::column(column const &other, cudaStream_t stream, rmm::mr::device_memory_resource *mr) +column::column(column const &other, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) : _type{other._type}, _size{other._size}, _data{other._data, stream, mr}, @@ -181,7 +183,7 @@ void column::set_null_count(size_type new_null_count) namespace { struct create_column_from_view { cudf::column_view view; - cudaStream_t stream; + rmm::cuda_stream_view stream{}; rmm::mr::device_memory_resource *mr; template operator()() { cudf::strings_column_view sview(view); - return cudf::strings::detail::copy_slice(sview, 0, view.size(), 1, stream, mr); + return cudf::strings::detail::copy_slice(sview, 0, view.size(), 1, stream.value(), mr); } template (indices_view, stream, mr)); children.emplace_back(std::make_unique(dict_view.keys(), stream, mr)); } - return std::make_unique( - view.type(), - view.size(), - rmm::device_buffer{0, stream, mr}, - cudf::detail::copy_bitmask(view, rmm::cuda_stream_view{stream}, mr), - view.null_count(), - std::move(children)); + return std::make_unique(view.type(), + view.size(), + rmm::device_buffer{0, stream, mr}, + cudf::detail::copy_bitmask(view, stream, mr), + view.null_count(), + std::move(children)); } template ()> * = nullptr> @@ -233,7 +234,7 @@ struct create_column_from_view { view.size() * cudf::size_of(view.type()), stream, mr}, - cudf::detail::copy_bitmask(view, rmm::cuda_stream_view{stream}, mr), + cudf::detail::copy_bitmask(view, stream, mr), view.null_count(), std::move(children)); } @@ -243,7 +244,7 @@ struct create_column_from_view { std::unique_ptr operator()() { auto lists_view = lists_column_view(view); - return cudf::lists::detail::copy_slice(lists_view, 0, view.size(), stream, mr); + return cudf::lists::detail::copy_slice(lists_view, 0, view.size(), stream.value(), mr); } template Date: Tue, 3 Nov 2020 12:27:12 +1100 Subject: [PATCH 07/51] Changelog for #6646 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 017d9f35806..08186bb4408 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,7 @@ - PR #6514 Initial work for decimal type in Java/JNI - PR #6608 Improve subword tokenizer docs - PR #6612 Update JNI to new RMM cuda_stream_view API +- PR #6646 Replace `cudaStream_t` with 
`rmm::cuda_stream_view` (part 1) ## Bug Fixes From 0ebf99e00572a55ab83f9c244ef6ee4eb75e973b Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 3 Nov 2020 14:52:16 +1100 Subject: [PATCH 08/51] Convert column factories to cuda_stream_view --- cpp/include/cudf/column/column_factories.hpp | 152 ++++++++++--------- cpp/src/column/column_factories.cpp | 48 +++--- cpp/src/copying/scatter.cu | 2 +- cpp/src/dictionary/replace.cu | 2 +- cpp/src/filling/fill.cu | 2 +- cpp/src/hash/hashing.cu | 2 +- cpp/src/lists/lists_column_factories.cu | 4 +- cpp/src/replace/clamp.cu | 2 +- cpp/src/strings/find.cu | 33 ++-- cpp/src/strings/strings_column_factories.cu | 30 ++-- cpp/src/strings/substring.cu | 8 +- cpp/src/structs/structs_column_factories.cu | 14 +- 12 files changed, 153 insertions(+), 146 deletions(-) diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index b40089f0929..7665cd8ca86 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -15,11 +15,13 @@ */ #pragma once -#include #include #include #include +#include +#include + namespace cudf { /** * @addtogroup column_factories @@ -31,9 +33,9 @@ namespace cudf { /** * @brief Creates an empty column of the specified @p type * - * An empty column does not contain any elements or a validity mask. + * An empty column contains zero elements and no validity mask. * - * @param type The desired type + * @param[in] type The column data type * @return Empty column with desired type */ std::unique_ptr make_empty_column(data_type type); @@ -59,7 +61,7 @@ std::unique_ptr make_numeric_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -85,7 +87,7 @@ std::unique_ptr make_numeric_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type."); @@ -115,7 +117,7 @@ std::unique_ptr make_fixed_point_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -139,7 +141,7 @@ std::unique_ptr make_fixed_point_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type."); @@ -171,7 +173,7 @@ std::unique_ptr make_timestamp_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -197,7 +199,7 @@ std::unique_ptr make_timestamp_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) 
{ CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); @@ -229,7 +231,7 @@ std::unique_ptr make_duration_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -255,7 +257,7 @@ std::unique_ptr make_duration_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); @@ -287,7 +289,7 @@ std::unique_ptr make_fixed_width_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -313,7 +315,7 @@ std::unique_ptr make_fixed_width_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type."); @@ -342,16 +344,16 @@ std::unique_ptr make_fixed_width_column( * * @throws std::bad_alloc if device memory allocation fails * - * @param strings The vector of pointer/size pairs. + * @param[in] strings The vector of pointer/size pairs. * Each pointer must be a device memory address or `nullptr` * (indicating a null string). The size must be the number of bytes. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used for allocation of the column's `null_mask` and children + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. */ std::unique_ptr make_strings_column( const rmm::device_vector>& strings, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -370,20 +372,20 @@ std::unique_ptr make_strings_column( * * @throws std::bad_alloc if device memory allocation fails * - * @param string_views The vector of string_view. + * @param[in] string_views The vector of string_view. * Each string_view must point to a device memory address or * `null_placeholder` (indicating a null string). The size must be the number of * bytes. - * @param null_placeholder string_view indicating null string in given list of + * @param[in] null_placeholder string_view indicating null string in given list of * string_views. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used for allocation of the column's `null_mask` and children + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. 
*/ std::unique_ptr make_strings_column( const rmm::device_vector& string_views, const string_view null_placeholder, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -401,21 +403,21 @@ std::unique_ptr make_strings_column( * * @throws std::bad_alloc if device memory allocation fails * - * @param strings The vector of chars in device memory. + * @param[in] strings The vector of chars in device memory. * This char vector is expected to be UTF-8 encoded characters. - * @param offsets The vector of byte offsets in device memory. + * @param[in] offsets The vector of byte offsets in device memory. * The number of elements is one more than the total number * of strings so the `offsets.back()` is the total * number of bytes in the strings array. * `offsets.front()` must always be 0 to point to the beginning * of `strings`. - * @param null_mask Device vector containing the null element indicator bitmask. + * @param[in] null_mask Device vector containing the null element indicator bitmask. * Arrow format for nulls is used for interpeting this bitmask. - * @param null_count The number of null string entries. If equal to + * @param[in] null_count The number of null string entries. If equal to * `UNKNOWN_NULL_COUNT`, the null count will be computed dynamically on the * first invocation of `column::null_count()` - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used for allocation of the column's `null_mask` and children + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. */ std::unique_ptr make_strings_column( @@ -423,7 +425,7 @@ std::unique_ptr make_strings_column( const rmm::device_vector& offsets, const rmm::device_vector& null_mask = {}, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -441,21 +443,21 @@ std::unique_ptr make_strings_column( * * @throws std::bad_alloc if device memory allocation fails * - * @param strings The contiguous array of chars in host memory. + * @param[in] strings The contiguous array of chars in host memory. * This char array is expected to be UTF-8 encoded characters. - * @param offsets The array of byte offsets in host memory. + * @param[in] offsets The array of byte offsets in host memory. * The number of elements is one more than the total number * of strings so the `offsets.back()` is the total * number of bytes in the strings array. * `offsets.front()` must always be 0 to point to the beginning * of `strings`. - * @param null_mask Host vector containing the null element indicator bitmask. + * @param[in] null_mask Host vector containing the null element indicator bitmask. * Arrow format for nulls is used for interpeting this bitmask. - * @param null_count The number of null string entries. If equal to + * @param[in] null_count The number of null string entries. If equal to * `UNKNOWN_NULL_COUNT`, the null count will be computed dynamically on the * first invocation of `column::null_count()` - * @param stream CUDA stream used for device memory operations and kernel launches. 
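A hedged illustration of the host-vector `make_strings_column` overload documented here. The template arguments were lost when the diff was rendered; `std::vector<char>` and `std::vector<cudf::size_type>` are assumed below, and the helper name is made up.

```cpp
#include <cudf/column/column_factories.hpp>

#include <vector>

// Sketch only: build the three strings "ab", "" and "cde" from host buffers.
std::unique_ptr<cudf::column> tiny_strings_column()
{
  std::vector<char> chars{'a', 'b', 'c', 'd', 'e'};    // concatenated UTF-8 bytes
  std::vector<cudf::size_type> offsets{0, 2, 2, 5};    // one more entry than strings
  // null_mask, null_count, stream and mr all take their defaults.
  return cudf::make_strings_column(chars, offsets);
}
```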
- * @param mr Device memory resource used for allocation of the column's `null_mask` and children + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. */ std::unique_ptr make_strings_column( @@ -463,7 +465,7 @@ std::unique_ptr make_strings_column( const std::vector& offsets, const std::vector& null_mask = {}, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -471,19 +473,19 @@ std::unique_ptr make_strings_column( * and null mask and null count. The columns and mask are moved into the * resulting strings column. * - * @param num_strings The number of strings the column represents. - * @param offsets_column The column of offset values for this column. + * @param[in] num_strings The number of strings the column represents. + * @param[in] offsets_column The column of offset values for this column. * The number of elements is one more than the total number * of strings so the offset[last] - offset[0] is the total * number of bytes in the strings vector. - * @param chars_column The column of char bytes for all the strings for this column. + * @param[in] chars_column The column of char bytes for all the strings for this column. * Individual strings are identified by the offsets and the * nullmask. - * @param null_count The number of null string entries. - * @param null_mask The bits specifying the null strings in device memory. + * @param[in] null_count The number of null string entries. + * @param[in] null_mask The bits specifying the null strings in device memory. * Arrow format for nulls is used for interpeting this bitmask. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used for allocation of the column's `null_mask` and children + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. */ std::unique_ptr make_strings_column( @@ -492,7 +494,7 @@ std::unique_ptr make_strings_column( std::unique_ptr chars_column, size_type null_count, rmm::device_buffer&& null_mask, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -534,21 +536,21 @@ std::unique_ptr make_strings_column( * data (depth 1) {1, 2, 3, 4, 5, 6, 7} * @endcode * - * @param num_lists The number of lists the column represents. - * @param offsets_column The column of offset values for this column. Each value should represent - * the starting offset into the child elements that corresponds to the - * beginning of the row, with the first row starting at 0. The length of row - * N can be determined by subtracting offsets[N+1] - offsets[N]. The total - * number of offsets should be 1 longer than the # of rows in the column. - * @param child_column The column of nested data referenced by the lists represented by the + * @param[in] num_lists The number of lists the column represents. + * @param[in] offsets_column The column of offset values for this column. 
Each value should + * represent the starting offset into the child elements that corresponds to the beginning of the + * row, with the first row starting at 0. The length of row N can be determined by subtracting + * offsets[N+1] - offsets[N]. The total number of offsets should be 1 longer than the # of rows in + * the column. + * @param[in] child_column The column of nested data referenced by the lists represented by the * offsets_column. Note: the child column may itself be * further nested. - * @param null_count The number of null list entries. - * @param null_mask The bits specifying the null lists in device memory. + * @param[in] null_count The number of null list entries. + * @param[in] null_mask The bits specifying the null lists in device memory. * Arrow format for nulls is used for interpeting this bitmask. - * @param stream Optional stream for use with all memory allocation + * @param[in] stream Optional stream for use with all memory allocation * and device kernels - * @param mr Optional resource to use for device memory + * @param[in] mr Optional resource to use for device memory * allocation of the column's `null_mask` and children. */ std::unique_ptr make_lists_column( @@ -557,7 +559,7 @@ std::unique_ptr make_lists_column( std::unique_ptr child_column, size_type null_count, rmm::device_buffer&& null_mask, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -575,12 +577,12 @@ std::unique_ptr make_lists_column( * The specified null mask governs which struct row has a null value. This * is orthogonal to the null values of individual child columns. * - * @param num_rows The number of struct values in the struct column. - * @param child_columns The list of child/members that the struct is comprised of. - * @param null_count The number of null values in the struct column. - * @param null_mask The bits specifying the null struct values in the column. - * @param stream Optional stream for use with all memory allocation and device kernels. - * @param mr Optional resource to use for device memory allocation. + * @param[in] num_rows The number of struct values in the struct column. + * @param[in] child_columns The list of child/members that the struct is comprised of. + * @param[in] null_count The number of null values in the struct column. + * @param[in] null_mask The bits specifying the null struct values in the column. + * @param[in] stream Optional stream for use with all memory allocation and device kernels. + * @param[in] mr Optional resource to use for device memory allocation. * */ std::unique_ptr make_structs_column( @@ -588,7 +590,7 @@ std::unique_ptr make_structs_column( std::vector>&& child_columns, size_type null_count, rmm::device_buffer&& null_mask, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -599,16 +601,16 @@ std::unique_ptr make_structs_column( * The output column will contain all null rows if `s.invalid()==false` * The output column will be empty if `size==0`. * - * @param s The scalar to use for values in the column. - * @param size The number of rows for the output column. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. + * @param[in] s The scalar to use for values in the column. 
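A sketch of a caller adapting to the reordered `make_column_from_scalar` below: the stream view now comes before the memory resource, matching the rest of libcudf. `fill_with_value` is a hypothetical wrapper, and it assumes the scalar constructors already accept a stream view at this point in the series.

```cpp
#include <cudf/column/column_factories.hpp>
#include <cudf/scalar/scalar.hpp>

#include <rmm/cuda_stream_view.hpp>

// Sketch only: materialize `size` copies of `value` on the given stream.
std::unique_ptr<cudf::column> fill_with_value(int32_t value,
                                              cudf::size_type size,
                                              rmm::cuda_stream_view stream)
{
  cudf::numeric_scalar<int32_t> s(value, true, stream);
  return cudf::make_column_from_scalar(s, size, stream);  // was (s, size, mr, stream)
}
```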
+ * @param[in] size The number of rows for the output column. + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned column's device memory. */ std::unique_ptr make_column_from_scalar( scalar const& s, size_type size, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Return a dictionary column with size elements that are all equal to the @@ -619,16 +621,16 @@ std::unique_ptr make_column_from_scalar( * * @throw cudf::logic_error if `s.is_valid()==false` * - * @param s The scalar to use for values in the column. - * @param size The number of rows for the output column. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. + * @param[in] s The scalar to use for values in the column. + * @param[in] size The number of rows for the output column. + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned column's device memory. */ std::unique_ptr make_dictionary_from_scalar( scalar const& s, size_type size, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index efbfd1de501..72943313dc2 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -73,7 +73,7 @@ std::unique_ptr make_empty_column(data_type type) std::unique_ptr make_numeric_column(data_type type, size_type size, mask_state state, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -91,7 +91,7 @@ std::unique_ptr make_numeric_column(data_type type, std::unique_ptr make_fixed_point_column(data_type type, size_type size, mask_state state, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -109,7 +109,7 @@ std::unique_ptr make_fixed_point_column(data_type type, std::unique_ptr make_timestamp_column(data_type type, size_type size, mask_state state, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -127,7 +127,7 @@ std::unique_ptr make_timestamp_column(data_type type, std::unique_ptr make_duration_column(data_type type, size_type size, mask_state state, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -145,7 +145,7 @@ std::unique_ptr make_duration_column(data_type type, std::unique_ptr make_fixed_width_column(data_type type, size_type size, mask_state state, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -163,15 +163,15 @@ struct column_from_scalar_dispatch { template std::unique_ptr operator()(scalar const& value, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view 
stream, + rmm::mr::device_memory_resource* mr) const { if (!value.is_valid()) return make_fixed_width_column(value.type(), size, mask_state::ALL_NULL, stream, mr); auto output_column = make_fixed_width_column(value.type(), size, mask_state::UNALLOCATED, stream, mr); auto view = output_column->mutable_view(); - detail::fill_in_place(view, 0, size, value, stream); + detail::fill_in_place(view, 0, size, value, stream.value()); return output_column; } }; @@ -180,8 +180,8 @@ template <> std::unique_ptr column_from_scalar_dispatch::operator()( scalar const& value, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { auto null_mask = detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); @@ -199,7 +199,7 @@ std::unique_ptr column_from_scalar_dispatch::operator()(null_mask.data()), size}; auto sv = static_cast const&>(value); // fill the column with the scalar - auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, mr, stream); + auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, mr, stream.value()); output->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); // should be no nulls return output; } @@ -208,8 +208,8 @@ template <> std::unique_ptr column_from_scalar_dispatch::operator()( scalar const& value, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("dictionary not supported when creating from scalar"); } @@ -218,8 +218,8 @@ template <> std::unique_ptr column_from_scalar_dispatch::operator()( scalar const& value, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("TODO"); } @@ -228,31 +228,31 @@ template <> std::unique_ptr column_from_scalar_dispatch::operator()( scalar const& value, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("TODO. 
struct_view currently not supported."); } std::unique_ptr make_column_from_scalar(scalar const& s, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (size == 0) return make_empty_column(s.type()); - return type_dispatcher(s.type(), column_from_scalar_dispatch{}, s, size, mr, stream); + return type_dispatcher(s.type(), column_from_scalar_dispatch{}, s, size, stream, mr); } std::unique_ptr make_dictionary_from_scalar(scalar const& s, size_type size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (size == 0) return make_empty_column(data_type{type_id::DICTIONARY32}); CUDF_EXPECTS(s.is_valid(), "cannot create a dictionary with a null key"); return make_dictionary_column( - make_column_from_scalar(s, 1, mr, stream), - make_column_from_scalar(numeric_scalar(0), size, mr, stream), + make_column_from_scalar(s, 1, stream, mr), + make_column_from_scalar(numeric_scalar(0), size, stream, mr), rmm::device_buffer{0, stream, mr}, 0); } diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 39ef1237ad9..373ed224f99 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -177,7 +177,7 @@ struct column_scalar_scatterer_impl { { auto dict_target = dictionary::detail::add_keys( dictionary_column_view(target), - make_column_from_scalar(source.get(), 1, rmm::mr::get_current_device_resource(), stream) + make_column_from_scalar(source.get(), 1, stream, rmm::mr::get_current_device_resource()) ->view(), mr, stream); diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index fa3219ef039..918063ac508 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -173,7 +173,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, // first add the replacment to the keys so only the indices need to be processed auto const default_mr = rmm::mr::get_current_device_resource(); auto input_matched = dictionary::detail::add_keys( - input, make_column_from_scalar(replacement, 1, default_mr, stream)->view(), mr, stream); + input, make_column_from_scalar(replacement, 1, stream, default_mr)->view(), mr, stream); auto const input_view = dictionary_column_view(input_matched->view()); auto const scalar_index = get_index(input_view, replacement, default_mr, stream); diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index de6ab9f7261..a711482c1ac 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -162,7 +162,7 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()view(), mr, stream); cudf::column_view const target_indices = diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 63401ad823a..2066b889dd4 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -658,7 +658,7 @@ std::unique_ptr md5_hash(table_view const& input, { if (input.num_columns() == 0 || input.num_rows() == 0) { const string_scalar string_128bit("d41d8cd98f00b204e9orig98ecf8427e"); - auto output = make_column_from_scalar(string_128bit, input.num_rows(), mr, stream); + auto output = make_column_from_scalar(string_128bit, input.num_rows(), stream, mr); return output; } diff --git a/cpp/src/lists/lists_column_factories.cu b/cpp/src/lists/lists_column_factories.cu index 54ae7cfd5f5..baee0e82b72 100644 --- a/cpp/src/lists/lists_column_factories.cu +++ b/cpp/src/lists/lists_column_factories.cu @@ -17,6 +17,8 @@ 
#include #include +#include + namespace cudf { /** @@ -28,7 +30,7 @@ std::unique_ptr make_lists_column(size_type num_rows, std::unique_ptr child_column, size_type null_count, rmm::device_buffer&& null_mask, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (null_count > 0) { CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); } diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index fff063b269a..a2fd8c91bc7 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -323,7 +323,7 @@ std::unique_ptr dispatch_clamp::operator()( if (key.is_valid()) { result = dictionary::detail::add_keys( matched_view, - make_column_from_scalar(key_replace, 1, rmm::mr::get_current_device_resource(), stream) + make_column_from_scalar(key_replace, 1, stream, rmm::mr::get_current_device_resource()) ->view(), mr, stream); diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index 5a017570b59..d5a6356e3f1 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -68,13 +68,12 @@ std::unique_ptr find_fn(strings_column_view const& strings, auto d_strings = *strings_column; auto strings_count = strings.size(); // create output column - auto results = make_numeric_column( - data_type{type_id::INT32}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::INT32}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // set the position values by evaluating the passed function @@ -190,10 +189,9 @@ std::unique_ptr contains_fn(strings_column_view const& strings, if (target.size() == 0) // empty target string returns true { auto const true_scalar = make_fixed_width_scalar(true, stream); - auto results = make_column_from_scalar(*true_scalar, strings.size(), mr, stream); - results->set_null_mask( - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count()); + auto results = make_column_from_scalar(*true_scalar, strings.size(), stream, mr); + results->set_null_mask(cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count()); return results; } @@ -201,13 +199,12 @@ std::unique_ptr contains_fn(strings_column_view const& strings, auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // create output column - auto results = make_numeric_column( - data_type{type_id::BOOL8}, - strings_count, - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr), - strings.null_count(), - stream, - mr); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings_count, + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); auto results_view = results->mutable_view(); auto d_results = results_view.data(); // set the bool values by evaluating the passed function diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 6a97f30e5a0..60da9b682ec 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -23,6 +23,8 @@ #include #include +#include + #include #include @@ -32,18 +34,18 @@ namespace cudf { // Create a strings-type column from vector 
of pointer/size pairs std::unique_ptr make_strings_column( const rmm::device_vector>& strings, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); size_type strings_count = strings.size(); - if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream.value()); auto execpol = rmm::exec_policy(stream); auto d_strings = strings.data().get(); // check total size is not too large for cudf column size_t bytes = thrust::transform_reduce( - execpol->on(stream), + execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), [d_strings] __device__(size_t idx) { @@ -63,7 +65,7 @@ std::unique_ptr make_strings_column( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = strings::detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.data(); @@ -80,10 +82,10 @@ std::unique_ptr make_strings_column( // build chars column auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream.value()); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); - thrust::for_each_n(execpol->on(stream), + thrust::for_each_n(execpol->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, d_offsets, d_chars] __device__(size_type idx) { @@ -115,7 +117,7 @@ struct string_view_to_pair { // Create a strings-type column from vector of string_view std::unique_ptr make_strings_column(const rmm::device_vector& string_views, const string_view null_placeholder, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto it_pair = thrust::make_transform_iterator(string_views.begin(), string_view_to_pair{null_placeholder}); @@ -129,11 +131,11 @@ std::unique_ptr make_strings_column(const rmm::device_vector& stri const rmm::device_vector& offsets, const rmm::device_vector& valid_mask, size_type null_count, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); size_type num_strings = offsets.size() - 1; - if (num_strings == 0) return strings::detail::make_empty_strings_column(mr, stream); + if (num_strings == 0) return strings::detail::make_empty_strings_column(mr, stream.value()); CUDF_EXPECTS(null_count < num_strings, "null strings column not yet supported"); if (null_count > 0) { @@ -152,7 +154,7 @@ std::unique_ptr make_strings_column(const rmm::device_vector& stri offsets.data().get(), (num_strings + 1) * sizeof(int32_t), cudaMemcpyDeviceToDevice, - stream)); + stream.value())); // build null bitmask rmm::device_buffer null_mask{ valid_mask.data().get(), @@ -164,10 +166,10 @@ std::unique_ptr make_strings_column(const rmm::device_vector& stri // build chars column auto chars_column = - strings::detail::create_chars_child_column(num_strings, null_count, bytes, mr, stream); + strings::detail::create_chars_child_column(num_strings, null_count, bytes, mr, stream.value()); auto chars_view = chars_column->mutable_view(); 
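The pattern used throughout this file is worth calling out: the `rmm::cuda_stream_view` travels through the new interfaces, and `.value()` recovers the raw `cudaStream_t` for Thrust execution policies, `cudaMemcpyAsync`, and detail functions not yet converted. A small standalone sketch of that interop, with assumed helper name and header paths:

```cpp
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/thrust_rmm_allocator.h>  // rmm::exec_policy in this era of RMM (assumed path)

#include <thrust/fill.h>

// Sketch only: zero a device vector asynchronously on the given stream.
void zero_fill(rmm::device_uvector<int>& v, rmm::cuda_stream_view stream)
{
  thrust::fill(rmm::exec_policy(stream)->on(stream.value()), v.begin(), v.end(), 0);
}
```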
CUDA_TRY(cudaMemcpyAsync( - chars_view.data(), strings.data().get(), bytes, cudaMemcpyDeviceToDevice, stream)); + chars_view.data(), strings.data().get(), bytes, cudaMemcpyDeviceToDevice, stream.value())); return make_strings_column(num_strings, std::move(offsets_column), @@ -183,7 +185,7 @@ std::unique_ptr make_strings_column(const std::vector& strings, const std::vector& offsets, const std::vector& null_mask, size_type null_count, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { rmm::device_vector d_strings{strings}; rmm::device_vector d_offsets{offsets}; @@ -198,7 +200,7 @@ std::unique_ptr make_strings_column(size_type num_strings, std::unique_ptr chars_column, size_type null_count, rmm::device_buffer&& null_mask, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (null_count > 0) CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); CUDF_EXPECTS(num_strings == offsets_column->size() - 1, diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 1d4656ffa8f..493e773adb9 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -350,12 +350,12 @@ std::unique_ptr slice_strings(strings_column_view const& strings, // Compute the substring indices first auto start_chars_pos_vec = make_column_from_scalar(numeric_scalar(0, true, stream), strings_count, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); auto stop_chars_pos_vec = make_column_from_scalar(numeric_scalar(0, true, stream), strings_count, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); auto start_char_pos = start_chars_pos_vec->mutable_view().data(); auto end_char_pos = stop_chars_pos_vec->mutable_view().data(); diff --git a/cpp/src/structs/structs_column_factories.cu b/cpp/src/structs/structs_column_factories.cu index 2e239fce5f3..5f92fea76f5 100644 --- a/cpp/src/structs/structs_column_factories.cu +++ b/cpp/src/structs/structs_column_factories.cu @@ -14,12 +14,16 @@ * limitations under the License. 
*/ -#include #include #include +#include + +#include + +#include + +#include #include -#include "cudf/types.hpp" -#include "thrust/iterator/counting_iterator.h" namespace cudf { namespace { @@ -29,7 +33,7 @@ void superimpose_parent_nullmask(bitmask_type const* parent_null_mask, std::size_t parent_null_mask_size, size_type parent_null_count, column& child, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (!child.nullable()) { @@ -78,7 +82,7 @@ std::unique_ptr make_structs_column( std::vector>&& child_columns, size_type null_count, rmm::device_buffer&& null_mask, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(null_count <= 0 || !null_mask.is_empty(), From 31716db65511f0de03af26c1a5643802e073338d Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 3 Nov 2020 15:22:40 +1100 Subject: [PATCH 09/51] convert detail/aggregation headers and source to cuda_stream_view --- .../cudf/detail/aggregation/aggregation.cuh | 15 ++++++++++----- cpp/src/aggregation/aggregation.cu | 4 +++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index 95e5d74b8a5..51cdb7e5841 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -23,6 +23,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -409,15 +411,17 @@ struct identity_initializer { public: template std::enable_if_t(), void> operator()(mutable_column_view const& col, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { - thrust::fill( - rmm::exec_policy(stream)->on(stream), col.begin(), col.end(), get_identity()); + thrust::fill(rmm::exec_policy(stream)->on(stream.value()), + col.begin(), + col.end(), + get_identity()); } template std::enable_if_t(), void> operator()(mutable_column_view const& col, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { CUDF_FAIL("Unsupported aggregation for initializing values"); } @@ -436,10 +440,11 @@ struct identity_initializer { * @param table The table of columns to initialize. * @param aggs A vector of aggregation operations corresponding to the table * columns. The aggregations determine the identity value for each column. + * @param stream CUDA stream used for device memory operations and kernel launches. 
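To make the initializer above concrete, here is an illustrative reduction for a SUM aggregation, whose identity element is 0: initialization collapses to a stream-ordered fill. The function name and the explicit template parameter are stand-ins, not cudf API.

```cpp
#include <cudf/column/column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/thrust_rmm_allocator.h>  // rmm::exec_policy (assumed path)

#include <thrust/fill.h>

// Sketch only: fill a fixed-width column with the SUM identity (0) on `stream`.
template <typename T>
void initialize_sum_identity(cudf::mutable_column_view col, rmm::cuda_stream_view stream)
{
  thrust::fill(rmm::exec_policy(stream)->on(stream.value()),
               col.begin<T>(),
               col.end<T>(),
               T{0});  // identity element of SUM
}
```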
*/ void initialize_with_identity(mutable_table_view& table, std::vector const& aggs, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream); } // namespace detail } // namespace cudf diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu index fb4a30299fc..564713e959b 100644 --- a/cpp/src/aggregation/aggregation.cu +++ b/cpp/src/aggregation/aggregation.cu @@ -16,11 +16,13 @@ #include +#include + namespace cudf { namespace detail { void initialize_with_identity(mutable_table_view& table, std::vector const& aggs, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // TODO: Initialize all the columns in a single kernel instead of invoking one // kernel per column From ba2fc0fd3662a1d63bfc055e8b112b6e392b52f9 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 3 Nov 2020 15:46:33 +1100 Subject: [PATCH 10/51] Use cuda_stream_view in groupby sort_helper --- .../cudf/detail/groupby/sort_helper.hpp | 29 +++++----- cpp/include/cudf/detail/utilities/cuda.cuh | 7 ++- cpp/src/groupby/groupby.cu | 3 +- cpp/src/groupby/sort/groupby.cu | 2 +- cpp/src/groupby/sort/sort_helper.cu | 58 ++++++++++--------- 5 files changed, 53 insertions(+), 46 deletions(-) diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index 8024de7a6af..5d14b8dd8b6 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace cudf { namespace groupby { @@ -92,8 +93,8 @@ struct sort_groupby_helper { */ std::unique_ptr sorted_values( column_view const& values, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Groups a column of values according to `keys` @@ -107,8 +108,8 @@ struct sort_groupby_helper { */ std::unique_ptr grouped_values( column_view const& values, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Get a table of sorted unique keys @@ -116,8 +117,8 @@ struct sort_groupby_helper { * @return a new table in which each row is a unique row in the sorted key table. */ std::unique_ptr
unique_keys( - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Get a table of sorted keys @@ -125,8 +126,8 @@ struct sort_groupby_helper { * @return a new table containing the sorted keys. */ std::unique_ptr
sorted_keys( - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Get the number of groups in `keys` @@ -140,7 +141,7 @@ struct sort_groupby_helper { * When include_null_keys = NO, returned value is the number of rows in `keys` * in which no element is null */ - size_type num_keys(cudaStream_t stream = 0); + size_type num_keys(rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Get the sorted order of `keys`. @@ -155,7 +156,7 @@ struct sort_groupby_helper { * * @return the sort order indices for `keys`. */ - column_view key_sort_order(cudaStream_t stream = 0); + column_view key_sort_order(rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Get each group's offset into the sorted order of `keys`. @@ -168,7 +169,7 @@ struct sort_groupby_helper { * @return vector of offsets of the starting point of each group in the sorted * key table */ - index_vector const& group_offsets(cudaStream_t stream = 0); + index_vector const& group_offsets(rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Get the group labels corresponding to the sorted order of `keys`. @@ -183,7 +184,7 @@ struct sort_groupby_helper { * * @return vector of group labels for each row in the sorted key column */ - index_vector const& group_labels(cudaStream_t stream = 0); + index_vector const& group_labels(rmm::cuda_stream_view stream = rmm::cuda_stream_default); private: /** @@ -200,7 +201,7 @@ struct sort_groupby_helper { * @return A nullable column of `INT32` containing group labels in the order * of the unsorted key table */ - column_view unsorted_keys_labels(cudaStream_t stream = 0); + column_view unsorted_keys_labels(rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Get the column representing the row bitmask for the `keys` @@ -214,7 +215,7 @@ struct sort_groupby_helper { * Computes and stores bitmask on first invocation and returns stored column * on subsequent calls. 
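A sketch of a caller adapting to the new parameter order on the sort helper: the stream view leads and the memory resource follows, both defaulted. `grouped_keys_of` is a hypothetical wrapper, not part of the patch.

```cpp
#include <cudf/detail/groupby/sort_helper.hpp>
#include <cudf/table/table.hpp>

#include <rmm/cuda_stream_view.hpp>

// Sketch only: return the sorted, grouped keys using the reordered interface.
std::unique_ptr<cudf::table> grouped_keys_of(
  cudf::groupby::detail::sort::sort_groupby_helper& helper,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr)
{
  return helper.sorted_keys(stream, mr);  // previously sorted_keys(mr, stream)
}
```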
*/ - column_view keys_bitmask_column(cudaStream_t stream = 0); + column_view keys_bitmask_column(rmm::cuda_stream_view stream = rmm::cuda_stream_default); private: column_ptr _key_sorted_order; ///< Indices to produce _keys in sorted order diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 77d12663f20..33c61414a1c 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -21,9 +21,10 @@ #include #include +#include + #include -#include #include namespace cudf { @@ -168,9 +169,9 @@ __global__ void single_thread_kernel(F f) * @param stream CUDA stream used for the kernel launch */ template -void device_single_thread(Functor functor, cudaStream_t stream = 0) +void device_single_thread(Functor functor, rmm::cuda_stream_view stream = rmm::cuda_stream_default) { - single_thread_kernel<<<1, 1, 0, stream>>>(functor); + single_thread_kernel<<<1, 1, 0, stream.value()>>>(functor); } } // namespace detail diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 22d41f938c5..90bbf6490ac 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -33,6 +33,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace groupby { @@ -137,7 +138,7 @@ std::pair, std::vector> groupby::aggr groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto grouped_keys = helper().sorted_keys(mr, 0); + auto grouped_keys = helper().sorted_keys(rmm::cuda_stream_default, mr); auto group_offsets = helper().group_offsets(0); std::vector group_offsets_vector(group_offsets.size()); diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index 27c1a659b91..8e924f65d73 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -434,7 +434,7 @@ std::pair, std::vector> groupby::sort auto results = detail::extract_results(requests, cache); - return std::make_pair(helper().unique_keys(mr, stream), std::move(results)); + return std::make_pair(helper().unique_keys(stream, mr), std::move(results)); } } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 88bdaf829a1..b6a07a86af7 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -25,6 +25,8 @@ #include #include +#include + #include #include #include @@ -84,7 +86,7 @@ namespace cudf { namespace groupby { namespace detail { namespace sort { -size_type sort_groupby_helper::num_keys(cudaStream_t stream) +size_type sort_groupby_helper::num_keys(rmm::cuda_stream_view stream) { if (_num_keys > -1) return _num_keys; @@ -100,7 +102,7 @@ size_type sort_groupby_helper::num_keys(cudaStream_t stream) return _num_keys; } -column_view sort_groupby_helper::key_sort_order(cudaStream_t stream) +column_view sort_groupby_helper::key_sort_order(rmm::cuda_stream_view stream) { auto sliced_key_sorted_order = [stream, this]() { return cudf::detail::slice(this->_key_sorted_order->view(), 0, this->num_keys(stream)); @@ -117,7 +119,7 @@ column_view sort_groupby_helper::key_sort_order(cudaStream_t stream) auto d_key_sorted_order = _key_sorted_order->mutable_view().data(); - thrust::sequence(rmm::exec_policy(stream)->on(stream), + thrust::sequence(rmm::exec_policy(stream)->on(stream.value()), d_key_sorted_order, d_key_sorted_order + _key_sorted_order->size(), 0); @@ -131,7 +133,7 @@ column_view 
sort_groupby_helper::key_sort_order(cudaStream_t stream) {}, std::vector(_keys.num_columns(), null_order::AFTER), rmm::mr::get_current_device_resource(), - stream); + stream.value()); } else { // Pandas style // Temporarily prepend the keys table with a column that indicates the // presence of a null value within a row. This allows moving all rows that @@ -144,7 +146,7 @@ column_view sort_groupby_helper::key_sort_order(cudaStream_t stream) {}, std::vector(_keys.num_columns() + 1, null_order::AFTER), rmm::mr::get_current_device_resource(), - stream); + stream.value()); // All rows with one or more null values are at the end of the resulting sorted order. } @@ -152,27 +154,28 @@ column_view sort_groupby_helper::key_sort_order(cudaStream_t stream) return sliced_key_sorted_order(); } -sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(cudaStream_t stream) +sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets( + rmm::cuda_stream_view stream) { if (_group_offsets) return *_group_offsets; _group_offsets = std::make_unique(num_keys(stream) + 1); - auto device_input_table = table_device_view::create(_keys, stream); + auto device_input_table = table_device_view::create(_keys, stream.value()); auto sorted_order = key_sort_order().data(); decltype(_group_offsets->begin()) result_end; auto exec = rmm::exec_policy(stream); if (has_nulls(_keys)) { result_end = thrust::unique_copy( - exec->on(stream), + exec->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_keys(stream)), _group_offsets->begin(), permuted_row_equality_comparator(*device_input_table, sorted_order)); } else { result_end = thrust::unique_copy( - exec->on(stream), + exec->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_keys(stream)), _group_offsets->begin(), @@ -186,7 +189,8 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets(cuda return *_group_offsets; } -sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels(cudaStream_t stream) +sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels( + rmm::cuda_stream_view stream) { if (_group_labels) return *_group_labels; @@ -198,19 +202,19 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_labels(cudaS if (num_keys(stream) == 0) return group_labels; auto exec = rmm::exec_policy(stream); - thrust::scatter(exec->on(stream), + thrust::scatter(exec->on(stream.value()), thrust::make_constant_iterator(1, decltype(num_groups())(1)), thrust::make_constant_iterator(1, num_groups()), group_offsets().begin() + 1, group_labels.begin()); thrust::inclusive_scan( - exec->on(stream), group_labels.begin(), group_labels.end(), group_labels.begin()); + exec->on(stream.value()), group_labels.begin(), group_labels.end(), group_labels.begin()); return group_labels; } -column_view sort_groupby_helper::unsorted_keys_labels(cudaStream_t stream) +column_view sort_groupby_helper::unsorted_keys_labels(rmm::cuda_stream_view stream) { if (_unsorted_keys_labels) return _unsorted_keys_labels->view(); @@ -228,14 +232,14 @@ column_view sort_groupby_helper::unsorted_keys_labels(cudaStream_t stream) table_view({temp_labels->view()}), false, rmm::mr::get_current_device_resource(), - stream); + stream.value()); _unsorted_keys_labels = std::move(t_unsorted_keys_labels->release()[0]); return _unsorted_keys_labels->view(); } -column_view sort_groupby_helper::keys_bitmask_column(cudaStream_t stream) +column_view 
sort_groupby_helper::keys_bitmask_column(rmm::cuda_stream_view stream) { if (_keys_bitmask_column) return _keys_bitmask_column->view(); @@ -250,7 +254,7 @@ column_view sort_groupby_helper::keys_bitmask_column(cudaStream_t stream) auto keys_bitmask_view = _keys_bitmask_column->mutable_view(); using T = id_to_type; - thrust::fill(rmm::exec_policy(stream)->on(stream), + thrust::fill(rmm::exec_policy(stream)->on(stream.value()), keys_bitmask_view.begin(), keys_bitmask_view.end(), 0); @@ -259,14 +263,14 @@ column_view sort_groupby_helper::keys_bitmask_column(cudaStream_t stream) } sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values( - column_view const& values, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { column_ptr values_sort_order = cudf::detail::stable_sorted_order(table_view({unsorted_keys_labels(), values}), {}, std::vector(2, null_order::AFTER), mr, - stream); + stream.value()); // Zero-copy slice this sort order so that its new size is num_keys() column_view gather_map = cudf::detail::slice(values_sort_order->view(), 0, num_keys(stream)); @@ -276,13 +280,13 @@ sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values( cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, mr, - stream); + stream.value()); return std::move(sorted_values_table->release()[0]); } sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values( - column_view const& values, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto gather_map = key_sort_order(); @@ -291,13 +295,13 @@ sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values( cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, mr, - stream); + stream.value()); return std::move(grouped_values_table->release()[0]); } -std::unique_ptr
sort_groupby_helper::unique_keys(rmm::mr::device_memory_resource* mr, - cudaStream_t stream) +std::unique_ptr
sort_groupby_helper::unique_keys(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto idx_data = key_sort_order().data(); @@ -305,18 +309,18 @@ std::unique_ptr
sort_groupby_helper::unique_keys(rmm::mr::device_memory_r group_offsets().begin(), [idx_data] __device__(size_type i) { return idx_data[i]; }); return cudf::detail::gather( - _keys, gather_map_it, gather_map_it + num_groups(), false, mr, stream); + _keys, gather_map_it, gather_map_it + num_groups(), false, mr, stream.value()); } -std::unique_ptr
sort_groupby_helper::sorted_keys(rmm::mr::device_memory_resource* mr, - cudaStream_t stream) +std::unique_ptr
sort_groupby_helper::sorted_keys(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return cudf::detail::gather(_keys, key_sort_order(), cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, mr, - stream); + stream.value()); } } // namespace sort From d8827e909a161a026060dcf74e770b9980e8ad15 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 3 Nov 2020 16:14:04 +1100 Subject: [PATCH 11/51] Convert binops to cuda_stream_view --- cpp/include/cudf/detail/binaryop.hpp | 21 ++-- cpp/src/binaryop/binaryop.cpp | 117 ++++++++++++----------- cpp/src/binaryop/compiled/binary_ops.cu | 63 ++++++------ cpp/src/binaryop/compiled/binary_ops.hpp | 16 ++-- cpp/src/groupby/sort/groupby.cu | 4 +- 5 files changed, 113 insertions(+), 108 deletions(-) diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp index 23dccc70414..c12482967e1 100644 --- a/cpp/include/cudf/detail/binaryop.hpp +++ b/cpp/include/cudf/detail/binaryop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,11 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include +#include + namespace cudf { //! Inner interfaces and implementations namespace detail { @@ -32,8 +33,8 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::binary_operation(column_view const&, scalar const&, binary_operator, @@ -46,8 +47,8 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::binary_operation(column_view const&, column_view const&, @@ -60,8 +61,8 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::binary_operation(column_view const&, column_view const&, @@ -74,8 +75,8 @@ std::unique_ptr binary_operation( column_view const& rhs, std::string const& ptx, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 55aabb87d8d..fde4caa068d 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -45,8 +45,9 @@ #include #include +#include + #include -#include "rmm/cuda_stream_view.hpp" namespace 
cudf { @@ -57,7 +58,7 @@ namespace detail { */ rmm::device_buffer scalar_col_valid_mask_and(column_view const& col, scalar const& s, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (col.is_empty()) return rmm::device_buffer{0, stream, mr}; @@ -65,7 +66,7 @@ rmm::device_buffer scalar_col_valid_mask_and(column_view const& col, if (not s.is_valid()) { return cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr); } else if (s.is_valid() and col.nullable()) { - return cudf::detail::copy_bitmask(col, rmm::cuda_stream_view{stream}, mr); + return cudf::detail::copy_bitmask(col, stream, mr); } else { return rmm::device_buffer{0, stream, mr}; } @@ -105,11 +106,11 @@ void binary_operation(mutable_column_view& out, scalar const& lhs, column_view const& rhs, binary_operator op, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (is_null_dependent(op)) { cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) + hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_s_with_validity", // name of the kernel we are // launching {cudf::jit::get_type_name(out.type()), // list of template arguments @@ -126,7 +127,7 @@ void binary_operation(mutable_column_view& out, lhs.is_valid()); } else { cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) + hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_s", // name of the kernel we are // launching {cudf::jit::get_type_name(out.type()), // list of template arguments @@ -144,11 +145,11 @@ void binary_operation(mutable_column_view& out, column_view const& lhs, scalar const& rhs, binary_operator op, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (is_null_dependent(op)) { cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) + hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_s_with_validity", // name of the kernel we are // launching {cudf::jit::get_type_name(out.type()), // list of template arguments @@ -165,7 +166,7 @@ void binary_operation(mutable_column_view& out, rhs.is_valid()); } else { cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) + hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_s", // name of the kernel we are // launching {cudf::jit::get_type_name(out.type()), // list of template arguments @@ -183,11 +184,11 @@ void binary_operation(mutable_column_view& out, column_view const& lhs, column_view const& rhs, binary_operator op, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (is_null_dependent(op)) { cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) + hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_v_with_validity", // name of the kernel we are // launching {cudf::jit::get_type_name(out.type()), // list of template arguments @@ -205,7 +206,7 @@ void binary_operation(mutable_column_view& out, rhs.offset()); } else { cudf::jit::launcher( - hash, code::kernel, header_names, cudf::jit::compiler_flags, headers_code, stream) + hash, code::kernel, header_names, 
cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_v", // name of the kernel we are // launching {cudf::jit::get_type_name(out.type()), // list of template arguments @@ -223,7 +224,7 @@ void binary_operation(mutable_column_view& out, column_view const& lhs, column_view const& rhs, const std::string& ptx, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::string const output_type_name = cudf::jit::get_type_name(out.type()); @@ -234,7 +235,7 @@ void binary_operation(mutable_column_view& out, cudf::jit::parse_single_function_ptx(ptx, "GENERIC_BINARY_OP", output_type_name) + code::kernel; cudf::jit::launcher( - ptx_hash, cuda_source, header_names, cudf::jit::compiler_flags, headers_code, stream) + ptx_hash, cuda_source, header_names, cudf::jit::compiler_flags, headers_code, stream.value()) .set_kernel_inst("kernel_v_v", // name of the kernel // we are launching {output_type_name, // list of template arguments @@ -277,8 +278,8 @@ std::unique_ptr make_fixed_width_column_for_output(scalar const& lhs, column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr); @@ -304,8 +305,8 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh scalar const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, lhs.size(), mask_state::ALL_VALID, stream, mr); @@ -331,8 +332,8 @@ std::unique_ptr make_fixed_width_column_for_output(column_view const& lh column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (binops::is_null_dependent(op)) { return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr); @@ -415,8 +416,8 @@ bool is_same_scale_necessary(binary_operator op) std::unique_ptr fixed_point_binary_operation(scalar const& lhs, column_view const& rhs, binary_operator op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace numeric; @@ -427,7 +428,7 @@ std::unique_ptr fixed_point_binary_operation(scalar const& lhs, auto const scale = compute_scale_for_binop(op, lhs.type().scale(), rhs.type().scale()); auto const output_type = is_comparison_binop(op) ? 
data_type{type_id::BOOL8} // : data_type{lhs.type().id(), scale}; - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, mr, stream); + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); if (rhs.is_empty()) return out; @@ -460,13 +461,13 @@ std::unique_ptr fixed_point_binary_operation(scalar const& lhs, auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{rhs.type().scale()}); - return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), mr, stream); + return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); } else { CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{rhs.type().scale()}); - return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), mr, stream); + return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); } }(); binops::jit::binary_operation(out_view, lhs, result->view(), op, stream); @@ -491,8 +492,8 @@ std::unique_ptr fixed_point_binary_operation(scalar const& lhs, std::unique_ptr fixed_point_binary_operation(column_view const& lhs, scalar const& rhs, binary_operator op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace numeric; @@ -503,7 +504,7 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, auto const scale = compute_scale_for_binop(op, lhs.type().scale(), rhs.type().scale()); auto const output_type = is_comparison_binop(op) ? data_type{type_id::BOOL8} // : data_type{lhs.type().id(), scale}; - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, mr, stream); + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); if (lhs.is_empty()) return out; @@ -536,13 +537,13 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{lhs.type().scale()}); - return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), mr, stream); + return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); } else { CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{lhs.type().scale()}); - return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), mr, stream); + return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); } }(); binops::jit::binary_operation(out_view, result->view(), rhs, op, stream); @@ -567,8 +568,8 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, std::unique_ptr fixed_point_binary_operation(column_view const& lhs, column_view const& rhs, binary_operator op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace numeric; @@ -579,7 +580,7 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, auto const scale = compute_scale_for_binop(op, lhs.type().scale(), rhs.type().scale()); auto const output_type = is_comparison_binop(op) ? 
data_type{type_id::BOOL8} // : data_type{lhs.type().id(), scale}; - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, mr, stream); + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); if (lhs.is_empty() or rhs.is_empty()) return out; @@ -594,13 +595,13 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{lhs.type().scale()}); - return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), mr, stream); + return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); } else { CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{lhs.type().scale()}); - return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), mr, stream); + return binary_operation(*scalar, lhs, binary_operator::MUL, rhs.type(), stream, mr); } }(); binops::jit::binary_operation(out_view, result->view(), rhs, op, stream); @@ -612,13 +613,13 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{rhs.type().scale()}); - return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), mr, stream); + return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); } else { CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); auto const scalar = make_fixed_point_scalar(factor, scale_type{rhs.type().scale()}); - return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), mr, stream); + return binary_operation(*scalar, rhs, binary_operator::MUL, lhs.type(), stream, mr); } }(); binops::jit::binary_operation(out_view, lhs, result->view(), op, stream); @@ -634,21 +635,21 @@ std::unique_ptr binary_operation(scalar const& lhs, column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING) - return binops::compiled::binary_operation(lhs, rhs, op, output_type, mr, stream); + return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr); if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) - return fixed_point_binary_operation(lhs, rhs, op, mr, stream); + return fixed_point_binary_operation(lhs, rhs, op, stream, mr); // Check for datatype CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype"); CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype"); - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, mr, stream); + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); if (rhs.is_empty()) return out; @@ -661,21 +662,21 @@ std::unique_ptr binary_operation(column_view const& lhs, scalar const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (lhs.type().id() == type_id::STRING and rhs.type().id() == 
type_id::STRING) - return binops::compiled::binary_operation(lhs, rhs, op, output_type, mr, stream); + return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr); if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) - return fixed_point_binary_operation(lhs, rhs, op, mr, stream); + return fixed_point_binary_operation(lhs, rhs, op, stream, mr); // Check for datatype CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype"); CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype"); - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, mr, stream); + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); if (lhs.is_empty()) return out; @@ -688,23 +689,23 @@ std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match"); if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING) - return binops::compiled::binary_operation(lhs, rhs, op, output_type, mr, stream); + return binops::compiled::binary_operation(lhs, rhs, op, output_type, stream, mr); if (is_fixed_point(lhs.type()) or is_fixed_point(rhs.type())) - return fixed_point_binary_operation(lhs, rhs, op, mr, stream); + return fixed_point_binary_operation(lhs, rhs, op, stream, mr); // Check for datatype CUDF_EXPECTS(is_fixed_width(output_type), "Invalid/Unsupported output datatype"); CUDF_EXPECTS(is_fixed_width(lhs.type()), "Invalid/Unsupported lhs datatype"); CUDF_EXPECTS(is_fixed_width(rhs.type()), "Invalid/Unsupported rhs datatype"); - auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, mr, stream); + auto out = make_fixed_width_column_for_output(lhs, rhs, op, output_type, stream, mr); if (lhs.is_empty() or rhs.is_empty()) return out; @@ -717,8 +718,8 @@ std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, std::string const& ptx, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Check for datatype auto is_type_supported_ptx = [](data_type type) -> bool { @@ -753,7 +754,7 @@ std::unique_ptr binary_operation(scalar const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, mr); + return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); } std::unique_ptr binary_operation(column_view const& lhs, @@ -763,7 +764,7 @@ std::unique_ptr binary_operation(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, mr); + return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); } std::unique_ptr binary_operation(column_view const& lhs, @@ -773,7 +774,7 @@ std::unique_ptr binary_operation(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, mr); + return detail::binary_operation(lhs, rhs, op, output_type, rmm::cuda_stream_default, mr); } std::unique_ptr binary_operation(column_view const& lhs, @@ -783,7 +784,7 @@ 
std::unique_ptr binary_operation(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, ptx, output_type, mr); + return detail::binary_operation(lhs, rhs, ptx, output_type, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index e21681a8467..a466a66f74f 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include "binary_ops.hpp" @@ -105,8 +106,8 @@ struct binary_op { binary_operator op, data_type out_type, bool const reversed, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto new_mask = binops::detail::scalar_col_valid_mask_and(lhs, rhs, stream, mr); auto out = make_fixed_width_column(out_type, @@ -125,12 +126,12 @@ struct binary_op { if (lhs.has_nulls()) { auto lhs_itr = cudf::detail::make_null_replacement_iterator(*lhs_device_view, Lhs{}); reversed - ? thrust::transform(rmm::exec_policy(stream)->on(stream), + ? thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), out_itr, apply_binop_scalar_rhs_lhs{op, rhs_scalar_view}) - : thrust::transform(rmm::exec_policy(stream)->on(stream), + : thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), out_itr, @@ -140,12 +141,12 @@ struct binary_op { thrust::make_counting_iterator(size_type{0}), [col = *lhs_device_view] __device__(size_type i) { return col.element(i); }); reversed - ? thrust::transform(rmm::exec_policy(stream)->on(stream), + ? 
thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), out_itr, apply_binop_scalar_rhs_lhs{op, rhs_scalar_view}) - : thrust::transform(rmm::exec_policy(stream)->on(stream), + : thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), out_itr, @@ -153,7 +154,7 @@ struct binary_op { } } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return out; } @@ -162,8 +163,8 @@ struct binary_op { column_view const& rhs, binary_operator op, data_type out_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto new_mask = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); auto out = make_fixed_width_column( @@ -177,7 +178,7 @@ struct binary_op { if (lhs.has_nulls() && rhs.has_nulls()) { auto lhs_itr = cudf::detail::make_null_replacement_iterator(*lhs_device_view, Lhs{}); auto rhs_itr = cudf::detail::make_null_replacement_iterator(*rhs_device_view, Rhs{}); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), rhs_itr, @@ -188,7 +189,7 @@ struct binary_op { auto rhs_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(size_type{0}), [col = *rhs_device_view] __device__(size_type i) { return col.element(i); }); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), rhs_itr, @@ -199,7 +200,7 @@ struct binary_op { thrust::make_counting_iterator(size_type{0}), [col = *lhs_device_view] __device__(size_type i) { return col.element(i); }); auto rhs_itr = cudf::detail::make_null_replacement_iterator(*rhs_device_view, Rhs{}); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), rhs_itr, @@ -212,7 +213,7 @@ struct binary_op { auto rhs_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(size_type{0}), [col = *rhs_device_view] __device__(size_type i) { return col.element(i); }); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), lhs_itr, lhs_itr + lhs.size(), rhs_itr, @@ -221,7 +222,7 @@ struct binary_op { } } - CHECK_CUDA(stream); + CHECK_CUDA(stream.value()); return out; } @@ -304,7 +305,7 @@ struct null_considering_binop { void populate_out_col(LhsViewT const& lhsv, RhsViewT const& rhsv, cudf::size_type col_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, CompareFunc cfunc, OutT* out_col) const { @@ -312,7 +313,7 @@ struct null_considering_binop { compare_functor binop_func{lhsv, rhsv, cfunc}; // Execute it on every element - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(col_size), out_col, @@ -326,8 +327,8 @@ struct null_considering_binop { binary_operator op, data_type output_type, cudf::size_type col_size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { std::unique_ptr out; // Create device views for inputs @@ -418,8 +419,8 @@ std::unique_ptr binary_operation(scalar const& lhs, column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, 
- cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); @@ -427,12 +428,12 @@ std::unique_ptr binary_operation(scalar const& lhs, if (is_null_dependent(op)) { if (rhs.is_empty()) return cudf::make_empty_column(output_type); auto rhs_device_view = cudf::column_device_view::create(rhs, stream); - return null_considering_binop{}(lhs, *rhs_device_view, op, output_type, rhs.size(), mr, stream); + return null_considering_binop{}(lhs, *rhs_device_view, op, output_type, rhs.size(), stream, mr); } else { CUDF_EXPECTS(is_boolean(output_type), "Invalid/Unsupported output datatype"); // Should pass the right type of scalar and column_view when specializing binary_op return binary_op{}( - rhs, lhs, op, output_type, true, mr, stream); + rhs, lhs, op, output_type, true, stream, mr); } } @@ -440,8 +441,8 @@ std::unique_ptr binary_operation(column_view const& lhs, scalar const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); @@ -449,11 +450,11 @@ std::unique_ptr binary_operation(column_view const& lhs, if (is_null_dependent(op)) { if (lhs.is_empty()) return cudf::make_empty_column(output_type); auto lhs_device_view = cudf::column_device_view::create(lhs, stream); - return null_considering_binop{}(*lhs_device_view, rhs, op, output_type, lhs.size(), mr, stream); + return null_considering_binop{}(*lhs_device_view, rhs, op, output_type, lhs.size(), stream, mr); } else { CUDF_EXPECTS(is_boolean(output_type), "Invalid/Unsupported output datatype"); return binary_op{}( - lhs, rhs, op, output_type, false, mr, stream); + lhs, rhs, op, output_type, false, stream, mr); } } @@ -461,8 +462,8 @@ std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // hard-coded to only work with cudf::string_view so we don't explode compile times CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype"); @@ -473,11 +474,11 @@ std::unique_ptr binary_operation(column_view const& lhs, auto lhs_device_view = cudf::column_device_view::create(lhs, stream); auto rhs_device_view = cudf::column_device_view::create(rhs, stream); return null_considering_binop{}( - *lhs_device_view, *rhs_device_view, op, output_type, lhs.size(), mr, stream); + *lhs_device_view, *rhs_device_view, op, output_type, lhs.size(), stream, mr); } else { CUDF_EXPECTS(is_boolean(output_type), "Invalid/Unsupported output datatype"); return binary_op{}( - lhs, rhs, op, output_type, mr, stream); + lhs, rhs, op, output_type, stream, mr); } } diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp index 3e6203ce8dd..a3f62f5018e 100644 --- a/cpp/src/binaryop/compiled/binary_ops.hpp +++ b/cpp/src/binaryop/compiled/binary_ops.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace binops { namespace detail { @@ -27,7 +29,7 @@ namespace detail { */ rmm::device_buffer 
scalar_col_valid_mask_and(column_view const& col, scalar const& s, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail @@ -66,8 +68,8 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a binary operation between a string column and a string @@ -92,8 +94,8 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a binary operation between two string columns. @@ -118,8 +120,8 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace compiled } // namespace binops diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index 8e924f65d73..c9038082d88 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -269,8 +269,8 @@ void store_result_functor::operator()(aggregation const& agg) count_result, binary_operator::DIV, cudf::detail::target_type(values.type(), aggregation::MEAN), - mr, - stream); + stream, + mr); cache.add_result(col_idx, agg, std::move(result)); }; From 6c88b596dc55483303bb4ee24ef52f8b1f2675d0 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 4 Nov 2020 17:27:20 +1100 Subject: [PATCH 12/51] Fix includes and copyright dates --- cpp/benchmarks/copying/shift_benchmark.cu | 31 +++++++++++++++---- .../null_mask/set_null_mask_benchmark.cpp | 7 +++-- cpp/include/cudf/ast/detail/transform.cuh | 3 +- cpp/include/cudf/column/column.hpp | 4 +-- .../cudf/column/column_device_view.cuh | 5 +-- cpp/include/cudf/detail/copy.hpp | 2 +- cpp/include/cudf/detail/copy_if.cuh | 5 +-- cpp/include/cudf/detail/copy_if_else.cuh | 2 +- cpp/include/cudf/detail/null_mask.hpp | 5 +-- cpp/include/cudf/detail/valid_if.cuh | 7 +++-- cpp/include/cudf/null_mask.hpp | 2 +- cpp/include/cudf/scalar/scalar.hpp | 2 +- .../cudf/strings/detail/copy_if_else.cuh | 5 +-- cpp/include/cudf/strings/detail/scatter.cuh | 2 +- cpp/src/binaryop/binaryop.cpp | 3 +- cpp/src/binaryop/compiled/binary_ops.cu | 6 ++-- cpp/src/bitmask/null_mask.cu | 4 +-- cpp/src/column/column_device_view.cu | 5 +-- cpp/src/copying/copy.cpp | 5 +-- cpp/src/copying/copy.cu | 5 +-- cpp/src/copying/copy_range.cu | 2 -- cpp/src/copying/sample.cu | 3 +- cpp/src/copying/shift.cu | 1 + cpp/src/copying/slice.cpp | 5 +-- cpp/src/datetime/datetime_ops.cu | 2 +- cpp/src/dictionary/dictionary_factories.cu | 1 + cpp/src/filling/fill.cu | 2 -- cpp/src/groupby/sort/sort_helper.cu | 2 +- cpp/src/io/csv/durations.cu | 1 + cpp/src/io/utilities/column_buffer.hpp | 2 +- cpp/src/lists/copying/copying.cu | 22 +++++++++++-- cpp/src/merge/merge.cu | 1 + cpp/src/quantiles/quantile.cu | 3 +- cpp/src/reductions/scan.cu | 18 ++++++++++- 
cpp/src/replace/nulls.cu | 2 +- cpp/src/replace/replace.cu | 7 +++-- cpp/src/reshape/byte_cast.cu | 3 +- cpp/src/reshape/interleave_columns.cu | 1 + cpp/src/scalar/scalar.cpp | 4 +-- cpp/src/strings/findall.cu | 4 ++- cpp/src/strings/split/split.cu | 1 + cpp/src/strings/substring.cu | 1 + cpp/src/strings/translate.cu | 1 + cpp/src/strings/wrap.cu | 3 +- cpp/src/text/replace.cu | 3 ++ cpp/src/text/stemmer.cu | 1 + cpp/src/unary/cast_ops.cu | 2 +- 47 files changed, 144 insertions(+), 64 deletions(-) diff --git a/cpp/benchmarks/copying/shift_benchmark.cu b/cpp/benchmarks/copying/shift_benchmark.cu index 4cf3455debb..291c0ef6777 100644 --- a/cpp/benchmarks/copying/shift_benchmark.cu +++ b/cpp/benchmarks/copying/shift_benchmark.cu @@ -1,15 +1,34 @@ -#include -#include -#include -#include -#include -#include +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include + #include #include #include #include + +#include + +#include +#include +#include +#include +#include + #include template > diff --git a/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp b/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp index e0a35ff0097..7f663700e02 100644 --- a/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp +++ b/cpp/benchmarks/null_mask/set_null_mask_benchmark.cpp @@ -14,12 +14,13 @@ * limitations under the License. */ -#include -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" +#include +#include #include +#include + class SetNullmask : public cudf::benchmark { }; diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index 454085ff9bd..96c8abe6c66 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -25,9 +25,10 @@ #include #include +#include + #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index b94a2f13e1d..7966b6a1472 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,7 +15,7 @@ */ #pragma once -#include "column_view.hpp" +#include #include #include diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 5118db2364e..046a8069d1f 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -15,8 +15,6 @@ */ #pragma once -#include -#include #include #include #include @@ -30,6 +28,9 @@ #include +#include +#include + /** * @file column_device_view.cuh * @brief Column device view class definitons diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index 0312f1ebe75..719323b6045 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 9399df22450..abae1c33d4f 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,9 +36,10 @@ #include #include -#include #include +#include + namespace { // Compute the count of elements that pass the mask within each block template diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index d5be077d27b..763da179639 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 4b2c5b0a8d6..50a2424e86c 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,8 +17,9 @@ #include +#include + #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index 011a3fa616c..f8f3ba51468 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,10 +22,11 @@ #include #include +#include +#include + #include #include -#include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 110fd2b5087..690f4cdbbb0 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index dcce9f043e8..1f960c21197 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -22,13 +22,13 @@ #include #include +#include #include #include #include #include #include -#include "rmm/cuda_stream_view.hpp" /** * @file diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 7bfe1df4239..3433ab7d210 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace strings { diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index 627b9902506..4f495afa099 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 55aabb87d8d..8322cc9cfae 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -45,8 +45,9 @@ #include #include +#include + #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index e21681a8467..94096158fab 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "binary_ops.hpp" + #include #include #include @@ -22,8 +24,6 @@ #include -#include "binary_ops.hpp" - namespace cudf { namespace binops { namespace compiled { diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 2a61c343b05..06f969a9d43 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,6 @@ #include #include #include -#include "rmm/mr/device/device_memory_resource.hpp" namespace cudf { size_type state_null_count(mask_state state, size_type size) diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index fb3bab68446..4250d63761f 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -17,10 +17,11 @@ #include #include #include -#include -#include "rmm/cuda_stream_view.hpp" #include +#include + +#include namespace cudf { // Trivially copy all members but the children diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index 6c0aeb601c2..811c9b6e42d 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,8 +24,9 @@ #include #include +#include + #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index 619d24c1204..91244af2d13 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -19,9 +19,10 @@ #include #include #include +#include #include -#include "cudf/fixed_point/fixed_point.hpp" -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace detail { diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index daca5900768..95be6cb8bbc 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -34,8 +34,6 @@ #include -#include - #include namespace { diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu index c270be1ccca..e9e2e1d6340 100644 --- a/cpp/src/copying/sample.cu +++ b/cpp/src/copying/sample.cu @@ -21,7 +21,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include #include #include diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu index 169b6760985..2dc3b04f2f2 100644 --- a/cpp/src/copying/shift.cu +++ b/cpp/src/copying/shift.cu @@ -31,6 +31,7 @@ #include #include #include + #include #include #include diff --git a/cpp/src/copying/slice.cpp b/cpp/src/copying/slice.cpp index a9141b7a48f..017dc37d002 100644 --- a/cpp/src/copying/slice.cpp +++ b/cpp/src/copying/slice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,8 +21,9 @@ #include #include +#include + #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index c3e2cc9a2ff..ce2df92efc0 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -25,9 +25,9 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" #include +#include namespace cudf { namespace datetime { diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index 286f4961946..ec598b71f88 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -21,6 +21,7 @@ #include #include #include + #include namespace cudf { diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index de6ab9f7261..a564eae5f01 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -32,8 +32,6 @@ #include #include -#include - #include namespace { diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 88bdaf829a1..f219084bfc7 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu index 863e7f0a8b3..15dfb5f5534 100644 --- a/cpp/src/io/csv/durations.cu +++ b/cpp/src/io/csv/durations.cu @@ -20,6 +20,7 @@ #include #include #include + #include #include diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index cde8a321f8e..0290857119b 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -31,8 +31,8 @@ #include #include +#include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace io { diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu index c7bf2139a83..ccf57a09d52 100644 --- a/cpp/src/lists/copying/copying.cu +++ b/cpp/src/lists/copying/copying.cu @@ -1,9 +1,27 @@ -#include +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ #include #include #include + +#include + +#include + #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace lists { diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index c22f5afe181..265a20bcbb7 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -24,6 +24,7 @@ #include #include + #include #include #include diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 280cc0198cf..31205f292c0 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -25,9 +25,10 @@ #include #include +#include + #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { diff --git a/cpp/src/reductions/scan.cu b/cpp/src/reductions/scan.cu index d5c9527e927..6d90124db36 100644 --- a/cpp/src/reductions/scan.cu +++ b/cpp/src/reductions/scan.cu @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include #include #include @@ -11,7 +26,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace detail { diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 2a8fea154e5..d13d729536b 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -33,7 +34,6 @@ #include #include #include -#include "cudf/copying.hpp" #include diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 21b583cddbe..01f75f41cfc 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -17,7 +17,7 @@ * limitations under the License. */ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -50,9 +50,10 @@ #include #include -#include +#include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace { // anonymous diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 841a8879aa6..0f5c7595cd0 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -21,7 +21,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace detail { diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index ef2ef8858ea..9e6197afe0f 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -19,6 +19,7 @@ #include #include #include + #include namespace cudf { diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 89d3534a41f..052c2aaedc7 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,10 @@ #include +#include #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { std::string string_scalar::to_string(rmm::cuda_stream_view stream) const diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/findall.cu index d7e695c0a3a..7830ab13dbb 100644 --- a/cpp/src/strings/findall.cu +++ b/cpp/src/strings/findall.cu @@ -23,9 +23,11 @@ #include #include #include + #include #include -#include "rmm/cuda_stream_view.hpp" + +#include #include diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 4ef46b289e2..fb0efa1131c 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -25,6 +25,7 @@ #include #include #include + #include #include // upper_bound() diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 1d4656ffa8f..af068e2997e 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -27,6 +27,7 @@ #include #include #include + #include namespace cudf { diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 1fc9ff7f813..4cc5d2bcba8 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -23,6 +23,7 @@ #include #include #include + #include #include diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index 181283c5e34..c61fd0797a4 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -15,6 +15,8 @@ */ #include +#include + #include #include #include @@ -25,7 +27,6 @@ #include #include #include -#include namespace cudf { namespace strings { diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 4263c5f1864..8da94e69da9 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -23,9 +23,12 @@ #include #include #include + #include #include + #include + #include namespace nvtext { diff --git a/cpp/src/text/stemmer.cu b/cpp/src/text/stemmer.cu index 1521dc90dae..8810ea759e7 100644 --- a/cpp/src/text/stemmer.cu +++ b/cpp/src/text/stemmer.cu @@ -23,6 +23,7 @@ #include #include #include + #include #include diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index e96f6e4f004..e8cc606865b 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
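The patches that follow continue the same mechanical conversion applied above: internal detail functions take an rmm::cuda_stream_view placed before the rmm::mr::device_memory_resource* argument, detail headers default the stream to rmm::cuda_stream_default, stream.synchronize() replaces cudaStreamSynchronize(), and stream.value() is used only where a raw cudaStream_t is still required (kernel launches, Thrust execution policies, CUDA runtime calls). A minimal sketch of that convention is shown below; the function detail::do_work and the kernel call are hypothetical illustrations, not part of the patch series.

    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    namespace cudf {
    namespace detail {

    // Internal (detail) API: stream parameter first, then memory resource, both defaulted.
    void do_work(
      rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

    }  // namespace detail
    }  // namespace cudf

    // Inside an implementation, the raw stream is extracted only at CUDA/Thrust boundaries:
    //   some_kernel<<<grid, block, 0, stream.value()>>>(...);
    //   thrust::transform(rmm::exec_policy(stream)->on(stream.value()), ...);
    //   stream.synchronize();  // instead of cudaStreamSynchronize(stream.value())
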
From 325ca52a79aec8488e4e3c89a9823279960817c8 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 5 Nov 2020 13:14:41 +1100 Subject: [PATCH 13/51] Update round to use detail::copy_bitmask --- cpp/src/round/round.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 9bc95175d9f..362ed4e8da0 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -124,7 +125,7 @@ struct round_fn { auto result = cudf::make_fixed_width_column(input.type(), // input.size(), - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count(), stream, mr); From ad24fb7dd6a054b2c5f68ada49b6c21c12c786c7 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 5 Nov 2020 13:22:58 +1100 Subject: [PATCH 14/51] Use stream.synchronize() --- cpp/src/column/column_device_view.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 4250d63761f..fb54c9b0bcc 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -133,7 +133,7 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str cudaMemcpyDefault, stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream.value())); + stream.synchronize(); return result; } From 3b6b0aabbed5faea6c7eea22744f378cc826e378 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 5 Nov 2020 14:21:31 +1100 Subject: [PATCH 15/51] Convert concatenate to cuda_stream_view --- cpp/include/cudf/detail/concatenate.cuh | 6 +- cpp/include/cudf/detail/concatenate.hpp | 10 ++-- .../cudf/dictionary/detail/concatenate.hpp | 4 +- cpp/include/cudf/lists/detail/concatenate.hpp | 4 +- .../cudf/strings/detail/concatenate.hpp | 6 +- cpp/src/copying/concatenate.cu | 57 ++++++++++--------- cpp/src/dictionary/add_keys.cu | 4 +- cpp/src/dictionary/detail/concatenate.cu | 27 +++++---- cpp/src/dictionary/set_keys.cu | 2 +- cpp/src/interop/from_arrow.cpp | 2 +- cpp/src/join/hash_join.cu | 2 +- cpp/src/lists/copying/concatenate.cu | 23 ++++---- cpp/src/replace/replace.cu | 2 +- cpp/src/strings/copying/concatenate.cu | 22 ++++--- 14 files changed, 97 insertions(+), 74 deletions(-) diff --git a/cpp/include/cudf/detail/concatenate.cuh b/cpp/include/cudf/detail/concatenate.cuh index b379a5b81a2..a30ad6e853d 100644 --- a/cpp/include/cudf/detail/concatenate.cuh +++ b/cpp/include/cudf/detail/concatenate.cuh @@ -21,6 +21,8 @@ #include #include +#include + #include namespace cudf { @@ -36,7 +38,7 @@ void concatenate_masks(rmm::device_vector const& d_views, rmm::device_vector const& d_offsets, bitmask_type* dest_mask, size_type output_size, - cudaStream_t stream); + rmm::cuda_stream_view stream); /** * @copydoc cudf::concatenate_masks(std::vector const&,bitmask_type*) @@ -45,7 +47,7 @@ void concatenate_masks(rmm::device_vector const& d_views, */ void concatenate_masks(std::vector const& views, bitmask_type* dest_mask, - cudaStream_t stream); + rmm::cuda_stream_view stream); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/concatenate.hpp b/cpp/include/cudf/detail/concatenate.hpp index 04faa0b11b5..43eb5203b37 100644 --- a/cpp/include/cudf/detail/concatenate.hpp +++ b/cpp/include/cudf/detail/concatenate.hpp @@ -19,6 +19,8 @@ #include #include +#include + #include namespace cudf { @@ -31,8 +33,8 @@ namespace detail { */ std::unique_ptr concatenate( std::vector const& 
columns_to_concat,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(),
-  cudaStream_t stream = 0);
+  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @copydoc cudf::concatenate(std::vector<table_view> const&,rmm::mr::device_memory_resource*)
@@ -41,8 +43,8 @@
  */
 std::unique_ptr<table>
concatenate( std::vector const& tables_to_concat, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/dictionary/detail/concatenate.hpp b/cpp/include/cudf/dictionary/detail/concatenate.hpp index 82467c7cda3..ae2e0f0ba38 100644 --- a/cpp/include/cudf/dictionary/detail/concatenate.hpp +++ b/cpp/include/cudf/dictionary/detail/concatenate.hpp @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -35,7 +37,7 @@ namespace detail { */ std::unique_ptr concatenate( std::vector const& columns, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/lists/detail/concatenate.hpp b/cpp/include/cudf/lists/detail/concatenate.hpp index 580e37dcf6a..f9adc893b8e 100644 --- a/cpp/include/cudf/lists/detail/concatenate.hpp +++ b/cpp/include/cudf/lists/detail/concatenate.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace lists { namespace detail { @@ -41,7 +43,7 @@ namespace detail { */ std::unique_ptr concatenate( std::vector const& columns, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp index a544e66a197..3e6fc6d67fc 100644 --- a/cpp/include/cudf/strings/detail/concatenate.hpp +++ b/cpp/include/cudf/strings/detail/concatenate.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -40,8 +42,8 @@ namespace detail { */ std::unique_ptr concatenate( std::vector const& columns, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace strings diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 95a4d16673e..1063422bf73 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -33,6 +33,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -44,11 +45,11 @@ constexpr bool use_fused_kernel_heuristic(bool const has_nulls, size_t const num return has_nulls || num_columns > 4; } -auto create_device_views(std::vector const& views, cudaStream_t stream) +auto create_device_views(std::vector const& views, rmm::cuda_stream_view stream) { // Create device views for each input view - using CDViewPtr = - decltype(column_device_view::create(std::declval(), std::declval())); + using CDViewPtr = decltype( + column_device_view::create(std::declval(), std::declval())); auto device_view_owners = std::vector(views.size()); std::transform( views.cbegin(), views.cend(), device_view_owners.begin(), [stream](auto const& col) { @@ -130,11 +131,11 @@ void concatenate_masks(rmm::device_vector const& d_views, rmm::device_vector const& d_offsets, bitmask_type* dest_mask, size_type output_size, - cudaStream_t 
stream) + rmm::cuda_stream_view stream) { constexpr size_type block_size{256}; cudf::detail::grid_1d config(output_size, block_size); - concatenate_masks_kernel<<>>( + concatenate_masks_kernel<<>>( d_views.data().get(), d_offsets.data().get(), static_cast(d_views.size()), @@ -144,7 +145,7 @@ void concatenate_masks(rmm::device_vector const& d_views, void concatenate_masks(std::vector const& views, bitmask_type* dest_mask, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Preprocess and upload inputs to device memory auto const device_views = create_device_views(views, stream); @@ -210,8 +211,8 @@ __global__ void fused_concatenate_kernel(column_device_view const* input_views, template std::unique_ptr fused_concatenate(std::vector const& views, bool const has_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using mask_policy = cudf::mask_allocation_policy; @@ -238,7 +239,7 @@ std::unique_ptr fused_concatenate(std::vector const& views, cudf::detail::grid_1d config(output_size, block_size); auto const kernel = has_nulls ? fused_concatenate_kernel : fused_concatenate_kernel; - kernel<<>>( + kernel<<>>( d_views.data().get(), d_offsets.data().get(), static_cast(d_views.size()), @@ -253,8 +254,8 @@ std::unique_ptr fused_concatenate(std::vector const& views, template std::unique_ptr for_each_concatenate(std::vector const& views, bool const has_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type const total_element_count = std::accumulate(views.begin(), views.end(), 0, [](auto accumulator, auto const& v) { @@ -263,15 +264,17 @@ std::unique_ptr for_each_concatenate(std::vector const& vie using mask_policy = cudf::mask_allocation_policy; auto const policy = has_nulls ? mask_policy::ALWAYS : mask_policy::NEVER; - auto col = cudf::allocate_like(views.front(), total_element_count, policy, mr); + auto col = cudf::detail::allocate_like(views.front(), total_element_count, policy, stream, mr); col->set_null_count(0); // prevent null count from being materialized... 
auto m_view = col->mutable_view(); // ...when we take a mutable view auto count = 0; for (auto& v : views) { - thrust::copy( - rmm::exec_policy()->on(stream), v.begin(), v.end(), m_view.begin() + count); + thrust::copy(rmm::exec_policy(stream)->on(stream.value()), + v.begin(), + v.end(), + m_view.begin() + count); count += v.size(); } @@ -285,8 +288,8 @@ std::unique_ptr for_each_concatenate(std::vector const& vie struct concatenate_dispatch { std::vector const& views; + rmm::cuda_stream_view stream; rmm::mr::device_memory_resource* mr; - cudaStream_t stream; // fixed width template @@ -299,9 +302,9 @@ struct concatenate_dispatch { // Use a heuristic to guess when the fused kernel will be faster if (use_fused_kernel_heuristic(has_nulls, views.size())) { - return fused_concatenate(views, has_nulls, mr, stream); + return fused_concatenate(views, has_nulls, stream, mr); } else { - return for_each_concatenate(views, has_nulls, mr, stream); + return for_each_concatenate(views, has_nulls, stream, mr); } } }; @@ -315,7 +318,7 @@ std::unique_ptr concatenate_dispatch::operator()() template <> std::unique_ptr concatenate_dispatch::operator()() { - return cudf::strings::detail::concatenate(views, mr, stream); + return cudf::strings::detail::concatenate(views, stream, mr); } template <> @@ -326,8 +329,8 @@ std::unique_ptr concatenate_dispatch::operator()() // Concatenates the elements from a vector of column_views std::unique_ptr concatenate(std::vector const& columns_to_concat, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(not columns_to_concat.empty(), "Unexpected empty list of columns to concatenate."); @@ -343,12 +346,12 @@ std::unique_ptr concatenate(std::vector const& columns_to_c return empty_like(columns_to_concat.front()); } - return type_dispatcher(type, concatenate_dispatch{columns_to_concat, mr, stream}); + return type_dispatcher(type, concatenate_dispatch{columns_to_concat, stream, mr}); } std::unique_ptr
concatenate(std::vector<table_view> const& tables_to_concat,
-                                   rmm::mr::device_memory_resource* mr,
-                                   cudaStream_t stream)
+                                   rmm::cuda_stream_view stream,
+                                   rmm::mr::device_memory_resource* mr)
 {
   if (tables_to_concat.empty()) { return std::make_unique<table>(); }
@@ -368,7 +371,7 @@ std::unique_ptr<table> concatenate(std::vector<table_view> const& tables_to_conc
                    tables_to_concat.cend(),
                    std::back_inserter(cols),
                    [i](auto const& t) { return t.column(i); });
-    concat_columns.emplace_back(detail::concatenate(cols, mr, stream));
+    concat_columns.emplace_back(detail::concatenate(cols, stream, mr));
   }
   return std::make_unique<table>(std::move(concat_columns));
 }
@@ -394,7 +397,7 @@ rmm::device_buffer concatenate_masks(std::vector<column_view> const& views,
     return null_mask;
   }
   // no nulls, so return an empty device buffer
-  return rmm::device_buffer{0, (cudaStream_t)0, mr};
+  return rmm::device_buffer{0, rmm::cuda_stream_default, mr};
 }
 
 // Concatenates the elements from a vector of column_views
@@ -402,14 +405,14 @@ std::unique_ptr<column> concatenate(std::vector<column_view> const& columns_to_c
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::concatenate(columns_to_concat, mr, 0);
+  return detail::concatenate(columns_to_concat, rmm::cuda_stream_default, mr);
 }
 
 std::unique_ptr<table>
concatenate(std::vector const& tables_to_concat, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(tables_to_concat, mr, 0); + return detail::concatenate(tables_to_concat, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index dc18afebb3b..5633dcfbc30 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -58,10 +58,10 @@ std::unique_ptr add_keys( // first, concatenate the keys together // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e] auto combined_keys = cudf::detail::concatenate( - std::vector{old_keys, new_keys}, rmm::mr::get_current_device_resource(), stream); + std::vector{old_keys, new_keys}, stream, rmm::mr::get_current_device_resource()); // sort and remove any duplicates from the combined keys // drop_duplicates([a,b,c,d,f,d,b,e]) = [a,b,c,d,e,f] - auto table_keys = cudf::detail::drop_duplicates(table_view{{*combined_keys}}, + auto table_keys = cudf::detail::drop_duplicates(table_view{{combined_keys->view()}}, std::vector{0}, // only one key column duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 0baec216c55..b83de6575e8 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -24,10 +24,13 @@ #include #include +#include +#include + #include #include + #include -#include #include namespace cudf { @@ -84,7 +87,7 @@ struct compute_children_offsets_fn { * @param stream Stream used for allocating the output rmm::device_uvector. * @return Vector of offsets_pair objects for keys and indices. */ - rmm::device_uvector create_children_offsets(cudaStream_t stream) + rmm::device_uvector create_children_offsets(rmm::cuda_stream_view stream) { std::vector offsets(columns_ptrs.size()); thrust::transform_exclusive_scan( @@ -105,8 +108,8 @@ struct compute_children_offsets_fn { offsets.data(), offsets.size() * sizeof(offsets_pair), cudaMemcpyHostToDevice, - stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.value())); + stream.synchronize(); return d_offsets; } @@ -130,7 +133,7 @@ struct dispatch_compute_indices { column_view const& new_keys, offsets_pair const* d_offsets, size_type const* d_map_to_keys, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto keys_view = column_device_view::create(all_keys, stream); @@ -155,7 +158,7 @@ struct dispatch_compute_indices { auto result_itr = cudf::detail::indexalator_factory::make_output_iterator(result->mutable_view()); // new indices values are computed by matching the concatenated keys to the new key set - thrust::lower_bound(rmm::exec_policy(stream)->on(stream), + thrust::lower_bound(rmm::exec_policy(stream)->on(stream.value()), new_keys_view->begin(), new_keys_view->end(), all_itr, @@ -173,7 +176,7 @@ struct dispatch_compute_indices { column_view const&, offsets_pair const*, size_type const*, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource*) { CUDF_FAIL("list_view as keys for dictionary not supported"); @@ -183,7 +186,7 @@ struct dispatch_compute_indices { } // namespace std::unique_ptr concatenate(std::vector const& columns, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // exception here is the same behavior as in cudf::concatenate @@ -202,7 +205,7 @@ std::unique_ptr concatenate(std::vector const& columns, return 
keys; }); auto all_keys = - cudf::detail::concatenate(keys_views, rmm::mr::get_current_device_resource(), stream); + cudf::detail::concatenate(keys_views, stream, rmm::mr::get_current_device_resource()); // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column @@ -211,7 +214,7 @@ std::unique_ptr concatenate(std::vector const& columns, duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, mr, - stream) + stream.value()) ->release(); std::unique_ptr keys_column(std::move(table_keys.front())); @@ -222,7 +225,7 @@ std::unique_ptr concatenate(std::vector const& columns, if (dict_view.is_empty()) return column_view{data_type{type_id::UINT32}, 0, nullptr}; return dict_view.get_indices_annotated(); // nicely includes validity mask and view offset }); - auto all_indices = cudf::detail::concatenate(indices_views, mr, stream); + auto all_indices = cudf::detail::concatenate(indices_views, stream, mr); auto const indices_size = all_indices->size(); // build a vector of values to map the old indices to the concatenated keys @@ -234,7 +237,7 @@ std::unique_ptr concatenate(std::vector const& columns, }); // the indices offsets (pair.second) are for building the map thrust::lower_bound( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), children_offsets.begin() + 1, children_offsets.end(), indices_itr, diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index e5be253fd1d..d95fdefe153 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -156,7 +156,7 @@ std::vector> match_dictionaries(std::vector keys(input.size()); std::transform(input.begin(), input.end(), keys.begin(), [](auto& col) { return col.keys(); }); - auto new_keys = cudf::detail::concatenate(keys, rmm::mr::get_current_device_resource(), stream); + auto new_keys = cudf::detail::concatenate(keys, stream, rmm::mr::get_current_device_resource()); auto keys_view = new_keys->view(); std::vector> result(input.size()); std::transform(input.begin(), input.end(), result.begin(), [keys_view, mr, stream](auto& col) { diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 141c8121dff..045c1174b08 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -380,7 +380,7 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, concat_columns.end(), std::back_inserter(column_views), [](auto const& col) { return col->view(); }); - return cudf::detail::concatenate(column_views, mr, stream); + return cudf::detail::concatenate(column_views, stream, mr); }); return std::make_unique
(std::move(columns)); diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 120777abf96..456e26a7cae 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -425,7 +425,7 @@ std::pair, std::unique_ptr
> construct_join_output_ rmm::mr::get_current_device_resource(), stream); common_table = cudf::detail::concatenate( - {common_from_build->view(), common_from_probe->view()}, mr, stream); + {common_from_build->view(), common_from_probe->view()}, stream, mr); } joined_indices = concatenate_vector_pairs(complement_indices, joined_indices); } else { diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index 4fddf8f3ce9..4fc1ffce1ec 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -23,6 +23,9 @@ #include #include #include + +#include + #include namespace cudf { @@ -46,7 +49,7 @@ namespace { */ std::unique_ptr merge_offsets(std::vector const& columns, size_type total_list_count, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // outgoing offsets @@ -61,18 +64,18 @@ std::unique_ptr merge_offsets(std::vector const& colu std::for_each(columns.begin(), columns.end(), [&](lists_column_view const& c) { if (c.size() > 0) { // handle sliced columns - int const local_shift = - shift - - (c.offset() > 0 ? cudf::detail::get_value(c.offsets(), c.offset(), stream) : 0); - column_device_view offsets(c.offsets(), 0, 0); + int const local_shift = shift - (c.offset() > 0 ? cudf::detail::get_value( + c.offsets(), c.offset(), stream.value()) + : 0); + column_device_view offsets(c.offsets(), nullptr, nullptr); thrust::transform( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), offsets.begin() + c.offset(), offsets.begin() + c.offset() + c.size() + 1, d_merged_offsets.begin() + count, [local_shift] __device__(size_type offset) { return offset + local_shift; }); - shift += c.get_sliced_child(stream).size(); + shift += c.get_sliced_child(stream.value()).size(); count += c.size(); } }); @@ -88,7 +91,7 @@ std::unique_ptr merge_offsets(std::vector const& colu */ std::unique_ptr concatenate( std::vector const& columns, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { std::vector lists_columns; @@ -107,9 +110,9 @@ std::unique_ptr concatenate( [&total_list_count, &children, stream](lists_column_view const& l) { // count total # of lists total_list_count += l.size(); - children.push_back(l.get_sliced_child(stream)); + children.push_back(l.get_sliced_child(stream.value())); }); - auto data = cudf::detail::concatenate(children, mr, stream); + auto data = cudf::detail::concatenate(children, stream, mr); // merge offsets auto offsets = merge_offsets(lists_columns, total_list_count, stream, mr); diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 01f75f41cfc..eef397b6a13 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -453,7 +453,7 @@ std::unique_ptr replace_kernel_forwarder::operator()view(), mr, stream); }(); auto matched_view = cudf::dictionary_column_view(matched_input->view()); diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 9a8a64f2f99..be56c256bfa 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -24,6 +24,8 @@ #include #include +#include + #include #include #include @@ -61,7 +63,8 @@ struct chars_size_transform { } }; -auto create_strings_device_views(std::vector const& views, cudaStream_t stream) +auto create_strings_device_views(std::vector const& views, + rmm::cuda_stream_view stream) { // Create device views for 
each input view using CDViewPtr = @@ -101,12 +104,12 @@ auto create_strings_device_views(std::vector const& views, cudaStre // error: the default constructor of "cudf::column_device_view" cannot be // referenced -- it is a deleted function auto d_partition_offsets = rmm::device_vector(views.size() + 1); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), d_views.cbegin(), d_views.cend(), std::next(d_partition_offsets.begin()), chars_size_transform{}); - thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), d_partition_offsets.cbegin(), d_partition_offsets.cend(), d_partition_offsets.begin()); @@ -213,8 +216,8 @@ __global__ void fused_concatenate_string_chars_kernel(column_device_view const* } std::unique_ptr concatenate(std::vector const& columns, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Compute output sizes auto const device_views = create_strings_device_views(columns, stream); @@ -225,7 +228,7 @@ std::unique_ptr concatenate(std::vector const& columns, auto const total_bytes = std::get<5>(device_views); auto const offsets_count = strings_count + 1; - if (strings_count == 0) { return make_empty_strings_column(mr, stream); } + if (strings_count == 0) { return make_empty_strings_column(mr, stream.value()); } CUDF_EXPECTS(offsets_count <= std::numeric_limits::max(), "total number of strings is too large for cudf column"); @@ -261,7 +264,7 @@ std::unique_ptr concatenate(std::vector const& columns, cudf::detail::grid_1d config(offsets_count, block_size); auto const kernel = has_nulls ? fused_concatenate_string_offset_kernel : fused_concatenate_string_offset_kernel; - kernel<<>>( + kernel<<>>( d_views.data().get(), d_input_offsets.data().get(), d_partition_offsets.data().get(), @@ -281,7 +284,7 @@ std::unique_ptr concatenate(std::vector const& columns, constexpr size_type block_size{256}; cudf::detail::grid_1d config(total_bytes, block_size); auto const kernel = fused_concatenate_string_chars_kernel; - kernel<<>>( + kernel<<>>( d_views.data().get(), d_partition_offsets.data().get(), static_cast(d_views.size()), @@ -303,7 +306,8 @@ std::unique_ptr concatenate(std::vector const& columns, // copy the chars column data auto d_chars = chars_child.data() + bytes_offset; size_type bytes = thrust::device_pointer_cast(d_offsets)[column_size] - bytes_offset; - CUDA_TRY(cudaMemcpyAsync(d_new_chars, d_chars, bytes, cudaMemcpyDeviceToDevice, stream)); + CUDA_TRY( + cudaMemcpyAsync(d_new_chars, d_chars, bytes, cudaMemcpyDeviceToDevice, stream.value())); // get ready for the next column d_new_chars += bytes; From a2edf78e0154ad011b449862376f771d29e3f133 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 13:13:43 +1100 Subject: [PATCH 16/51] Convert table and copy_if to cuda_stream_view --- cpp/include/cudf/detail/copy_if.cuh | 53 ++++++++++--------- cpp/include/cudf/table/table.hpp | 6 ++- cpp/src/dictionary/remove_keys.cu | 2 +- .../stream_compaction/apply_boolean_mask.cu | 4 +- cpp/src/stream_compaction/drop_nans.cu | 2 +- cpp/src/stream_compaction/drop_nulls.cu | 2 +- cpp/src/table/table.cpp | 6 ++- cpp/src/text/generate_ngrams.cu | 4 +- 8 files changed, 42 insertions(+), 37 deletions(-) diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index abae1c33d4f..05a84a238ff 100644 --- a/cpp/include/cudf/detail/copy_if.cuh 
+++ b/cpp/include/cudf/detail/copy_if.cuh @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -210,8 +211,8 @@ struct scatter_gather_functor { cudf::size_type const* block_offsets, Filter filter, cudf::size_type per_thread, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto output_column = cudf::detail::allocate_like( input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); @@ -231,18 +232,18 @@ struct scatter_gather_functor { CUDA_TRY(cudaMemsetAsync(static_cast(output.null_mask()), 0, cudf::bitmask_allocation_size_bytes(output.size()), - stream)); + stream.value())); } auto output_device_view = cudf::mutable_column_device_view::create(output, stream); auto input_device_view = cudf::column_device_view::create(input, stream); - scatter<<>>(*output_device_view, - null_count.data(), - *input_device_view, - block_offsets, - input.size(), - per_thread, - filter); + scatter<<>>(*output_device_view, + null_count.data(), + *input_device_view, + block_offsets, + input.size(), + per_thread, + filter); if (has_valid) { output_column->set_null_count(null_count.value(stream)); } return output_column; @@ -256,19 +257,19 @@ struct scatter_gather_functor { cudf::size_type const* block_offsets, Filter filter, cudf::size_type per_thread, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { rmm::device_uvector indices(output_size, stream); - thrust::copy_if(rmm::exec_policy(stream)->on(stream), + thrust::copy_if(rmm::exec_policy(stream)->on(stream.value()), thrust::counting_iterator(0), thrust::counting_iterator(input.size()), indices.begin(), filter); auto output_table = cudf::detail::gather( - cudf::table_view{{input}}, indices.begin(), indices.end(), false, mr, stream); + cudf::table_view{{input}}, indices.begin(), indices.end(), false, mr, stream.value()); // There will be only one column return std::make_unique(std::move(output_table->get_column(0))); @@ -281,8 +282,8 @@ struct scatter_gather_functor { cudf::size_type const* block_offsets, Filter filter, cudf::size_type per_thread, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_FAIL("fixed_point type not supported for this operation yet"); } @@ -309,8 +310,8 @@ template std::unique_ptr
copy_if( table_view const& input, Filter filter, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_FUNC_RANGE(); @@ -326,12 +327,12 @@ std::unique_ptr
copy_if( rmm::device_uvector block_offsets(grid.num_blocks + 1, stream); // 1. Find the count of elements in each block that "pass" the mask - compute_block_counts<<>>( + compute_block_counts<<>>( block_counts.begin(), input.num_rows(), per_thread, filter); // initialize just the first element of block_offsets to 0 since the InclusiveSum below // starts at the second element. - CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream)); + CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value())); // 2. Find the offset for each block's output using a scan of block counts if (grid.num_blocks > 1) { @@ -342,7 +343,7 @@ std::unique_ptr
copy_if( block_counts.begin(), block_offsets.begin() + 1, grid.num_blocks, - stream); + stream.value()); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); // Run exclusive prefix sum @@ -351,7 +352,7 @@ std::unique_ptr
copy_if( block_counts.begin(), block_offsets.begin() + 1, grid.num_blocks, - stream); + stream.value()); } // As it is InclusiveSum, last value in block_offsets will be output_size @@ -362,9 +363,9 @@ std::unique_ptr
copy_if( grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(), sizeof(cudf::size_type), cudaMemcpyDefault, - stream)); + stream.value())); - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); if (output_size == input.num_rows()) { return std::make_unique
(input, stream, mr); @@ -378,8 +379,8 @@ std::unique_ptr
copy_if( block_offsets.begin(), filter, per_thread, - mr, - stream); + stream, + mr); }); return std::make_unique
(std::move(out_columns)); diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index afce337303f..e760e18c6d6 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include #include +#include + #include #include @@ -63,7 +65,7 @@ class table { * @param mr Device memory resource used for allocating the device memory for the new columns **/ table(table_view view, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index e6b179c0a7d..3913d68b10f 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -85,7 +85,7 @@ std::unique_ptr remove_keys_fn( // copy the non-removed keys ( keys_to_keep_fn(idx)==true ) auto table_keys = cudf::detail::copy_if( - table_view{{keys_view, keys_positions->view()}}, keys_to_keep_fn, mr, stream) + table_view{{keys_view, keys_positions->view()}}, keys_to_keep_fn, stream, mr) ->release(); auto const filtered_view = table_keys[1]->view(); auto filtered_itr = cudf::detail::indexalator_factory::make_input_iterator(filtered_view); diff --git a/cpp/src/stream_compaction/apply_boolean_mask.cu b/cpp/src/stream_compaction/apply_boolean_mask.cu index a6f548b7c5d..ccb31898e95 100644 --- a/cpp/src/stream_compaction/apply_boolean_mask.cu +++ b/cpp/src/stream_compaction/apply_boolean_mask.cu @@ -74,9 +74,9 @@ std::unique_ptr
apply_boolean_mask(table_view const& input, auto device_boolean_mask = cudf::column_device_view::create(boolean_mask, stream); if (boolean_mask.has_nulls()) { - return detail::copy_if(input, boolean_mask_filter{*device_boolean_mask}, mr, stream); + return detail::copy_if(input, boolean_mask_filter{*device_boolean_mask}, stream, mr); } else { - return detail::copy_if(input, boolean_mask_filter{*device_boolean_mask}, mr, stream); + return detail::copy_if(input, boolean_mask_filter{*device_boolean_mask}, stream, mr); } } diff --git a/cpp/src/stream_compaction/drop_nans.cu b/cpp/src/stream_compaction/drop_nans.cu index 2ba0f05b45a..ddd5d0c9934 100644 --- a/cpp/src/stream_compaction/drop_nans.cu +++ b/cpp/src/stream_compaction/drop_nans.cu @@ -99,7 +99,7 @@ std::unique_ptr
drop_nans(table_view const& input, auto keys_device_view = cudf::table_device_view::create(keys_view, stream); return cudf::detail::copy_if( - input, valid_table_filter{*keys_device_view, keep_threshold}, mr, stream); + input, valid_table_filter{*keys_device_view, keep_threshold}, stream, mr); } } // namespace detail diff --git a/cpp/src/stream_compaction/drop_nulls.cu b/cpp/src/stream_compaction/drop_nulls.cu index fb487e6d6e4..49708b635d8 100644 --- a/cpp/src/stream_compaction/drop_nulls.cu +++ b/cpp/src/stream_compaction/drop_nulls.cu @@ -72,7 +72,7 @@ std::unique_ptr
drop_nulls(table_view const& input, auto keys_device_view = cudf::table_device_view::create(keys_view, stream); return cudf::detail::copy_if( - input, valid_table_filter{*keys_device_view, keep_threshold}, mr, stream); + input, valid_table_filter{*keys_device_view, keep_threshold}, stream, mr); } } // namespace detail diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp index aca414e0df4..afda6313254 100644 --- a/cpp/src/table/table.cpp +++ b/cpp/src/table/table.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { // Copy the columns from another table @@ -46,7 +48,7 @@ table::table(std::vector>&& columns) : _columns{std::mov } // Copy the contents of a `table_view` -table::table(table_view view, cudaStream_t stream, rmm::mr::device_memory_resource* mr) +table::table(table_view view, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) : _num_rows{view.num_rows()} { CUDF_FUNC_RANGE(); diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index c3e338b59d8..792b94aaee6 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -104,8 +104,8 @@ std::unique_ptr generate_ngrams( if (d_strings.is_null(idx)) return false; return !d_strings.element(idx).empty(); }, - mr, - stream) + stream, + mr) ->release(); strings_count = table_offsets.front()->size() - 1; return std::move(table_offsets.front()); From 94b1627ec7c170e6523961b2d8d689a4b23d5194 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 13:23:51 +1100 Subject: [PATCH 17/51] Changelog for #6648 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e46bd535dc7..4cf4cfae1b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,6 +49,7 @@ - PR #6610 Add ability to set scalar values in `cudf.DataFrame` - PR #6612 Update JNI to new RMM cuda_stream_view API - PR #6646 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 1) +- PR #6648 Replace `cudaStream_t` with `rmm::cuda_stream_view` (part 2) - PR #6579 Update scatter APIs to use reference wrapper / const scalar - PR #6614 Add support for conversion to Pandas nullable dtypes and fix related issue in `cudf.to_json` - PR #6622 Update `to_pandas` api docs From c497fcc749dad1c87bc108b516406cad6837fafe Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 13:27:12 +1100 Subject: [PATCH 18/51] Convert copy_range to cuda_stream_view --- cpp/include/cudf/detail/copy_range.cuh | 16 ++--- .../cudf/strings/detail/copy_range.cuh | 15 ++--- cpp/src/copying/copy_range.cu | 59 ++++++++++--------- 3 files changed, 48 insertions(+), 42 deletions(-) diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 599b7de358d..afe67540c42 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
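As a point of reference, a minimal caller-side sketch of the convention this copy_range patch (and the series as a whole) adopts: the stream parameter moves directly in front of the memory resource, and rmm::cuda_stream_default replaces the literal 0. The wrapper name shift_into and the assumption that target has at least source.size() rows are illustrative only; the detail::copy_range argument order is the one introduced in this patch.

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/copy_range.cuh>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

// Copies all rows of `source` over the first source.size() rows of `target`,
// returning a new column; assumes target.size() >= source.size().
std::unique_ptr<cudf::column> shift_into(cudf::column_view const& source,
                                         cudf::column_view const& target,
                                         rmm::cuda_stream_view stream)
{
  // stream now precedes mr in the converted detail API
  return cudf::detail::copy_range(source,
                                  target,
                                  0,              // source_begin
                                  source.size(),  // source_end (exclusive)
                                  0,              // target_begin
                                  stream,
                                  rmm::mr::get_current_device_resource());
}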
@@ -22,6 +22,8 @@ #include #include #include + +#include #include #include @@ -134,7 +136,7 @@ void copy_range(SourceValueIterator source_value_begin, mutable_column_view& target, size_type target_begin, size_type target_end, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { CUDF_EXPECTS((target_begin <= target_end) && (target_begin >= 0) && (target_begin < target.size()) && (target_end <= target.size()), @@ -162,7 +164,7 @@ void copy_range(SourceValueIterator source_value_begin, auto kernel = copy_range_kernel; - kernel<<>>( + kernel<<>>( source_value_begin, source_validity_begin, *mutable_column_device_view::create(target, stream), @@ -174,7 +176,7 @@ void copy_range(SourceValueIterator source_value_begin, } else { auto kernel = copy_range_kernel; - kernel<<>>( + kernel<<>>( source_value_begin, source_validity_begin, *mutable_column_device_view::create(target, stream), @@ -195,7 +197,7 @@ void copy_range_in_place(column_view const& source, size_type source_begin, size_type source_end, size_type target_begin, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @copydoc cudf::copy_range @@ -208,8 +210,8 @@ std::unique_ptr copy_range( size_type source_begin, size_type source_end, size_type target_begin, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index b4b586d2b19..fe0d1dcf2a7 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -22,6 +22,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" #include #include @@ -100,8 +101,8 @@ std::unique_ptr copy_range( strings_column_view const& target, size_type target_begin, size_type target_end, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS( (target_begin >= 0) && (target_begin < target.size()) && (target_end <= target.size()), @@ -154,7 +155,7 @@ std::unique_ptr copy_range( source_value_begin, source_validity_begin, d_target, target_begin, target_end}); p_offsets_column = detail::make_offsets_child_column( - string_size_begin, string_size_begin + target.size(), mr, stream); + string_size_begin, string_size_begin + target.size(), mr, stream.value()); } else if (null_count > 0) { // check validities for source only auto string_size_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -162,7 +163,7 @@ std::unique_ptr copy_range( source_value_begin, source_validity_begin, d_target, target_begin, target_end}); p_offsets_column = detail::make_offsets_child_column( - string_size_begin, string_size_begin + target.size(), mr, stream); + string_size_begin, string_size_begin + target.size(), mr, stream.value()); } else { // no need to check validities auto string_size_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -170,7 +171,7 @@ std::unique_ptr copy_range( source_value_begin, source_validity_begin, d_target, target_begin, target_end}); p_offsets_column = detail::make_offsets_child_column( - string_size_begin, 
string_size_begin + target.size(), mr, stream); + string_size_begin, string_size_begin + target.size(), mr, stream.value()); } // create the chars column @@ -180,12 +181,12 @@ std::unique_ptr copy_range( auto chars_bytes = p_offsets[target.size()]; auto p_chars_column = strings::detail::create_chars_child_column( - target.size(), null_count, chars_bytes, mr, stream); + target.size(), null_count, chars_bytes, mr, stream.value()); // copy to the chars column auto p_chars = (p_chars_column->mutable_view()).template data(); - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(target.size()), [source_value_begin, diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index 95be6cb8bbc..1df9fc78aa2 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -35,6 +35,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace { template @@ -43,7 +44,7 @@ void in_place_copy_range(cudf::column_view const& source, cudf::size_type source_begin, cudf::size_type source_end, cudf::size_type target_begin, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { auto p_source_device_view = cudf::column_device_view::create(source, stream); if (source.has_nulls()) { @@ -72,7 +73,7 @@ struct in_place_copy_range_dispatch { std::enable_if_t(), void> operator()(cudf::size_type source_begin, cudf::size_type source_end, cudf::size_type target_begin, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { in_place_copy_range(source, target, source_begin, source_end, target_begin, stream); } @@ -81,7 +82,7 @@ struct in_place_copy_range_dispatch { std::enable_if_t(), void> operator()(cudf::size_type source_begin, cudf::size_type source_end, cudf::size_type target_begin, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { CUDF_FAIL("in-place copy does not work for variable width types."); } @@ -96,8 +97,8 @@ struct out_of_place_copy_range_dispatch { cudf::size_type source_begin, cudf::size_type source_end, cudf::size_type target_begin, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto p_ret = std::make_unique(target, stream, mr); if ((!p_ret->nullable()) && source.has_nulls(source_begin, source_end)) { @@ -119,8 +120,8 @@ std::unique_ptr out_of_place_copy_range_dispatch::operator() out_of_place_copy_range_dispatch::operator()begin() + source_begin, @@ -142,8 +143,8 @@ std::unique_ptr out_of_place_copy_range_dispatch::operator() out_of_place_copy_range_dispatch::operator() out_of_place_copy_range_dispatch::operator() out_of_place_copy_range_dispatch::operator() out_of_place_copy_range_dispatch::operator()view()); auto source_matched = cudf::dictionary::detail::set_keys( - dict_source, target_view.keys(), rmm::mr::get_current_device_resource(), stream); + dict_source, target_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); auto const source_view = cudf::dictionary_column_view(source_matched->view()); // build the new indices by calling in_place_copy_range on just the indices @@ -230,8 +231,8 @@ std::unique_ptr out_of_place_copy_range_dispatch::operator() copy_range(column_view const& source, size_type source_begin, size_type source_end, size_type target_begin, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + 
rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS((source_begin >= 0) && (source_end <= source.size()) && (source_begin <= source_end) && (target_begin >= 0) && @@ -286,8 +287,8 @@ std::unique_ptr copy_range(column_view const& source, source_begin, source_end, target_begin, - mr, - stream); + stream, + mr); } } // namespace detail @@ -299,7 +300,8 @@ void copy_range_in_place(column_view const& source, size_type target_begin) { CUDF_FUNC_RANGE(); - return detail::copy_range_in_place(source, target, source_begin, source_end, target_begin, 0); + return detail::copy_range_in_place( + source, target, source_begin, source_end, target_begin, rmm::cuda_stream_default); } std::unique_ptr copy_range(column_view const& source, @@ -310,7 +312,8 @@ std::unique_ptr copy_range(column_view const& source, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_range(source, target, source_begin, source_end, target_begin, mr, 0); + return detail::copy_range( + source, target, source_begin, source_end, target_begin, rmm::cuda_stream_default, mr); } } // namespace cudf From 03aec6aaa698b0994aa528efa7caeff652c32a7f Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 13:38:04 +1100 Subject: [PATCH 19/51] Convert fill to cuda_stream_view --- cpp/include/cudf/detail/fill.hpp | 10 +++-- cpp/include/cudf/strings/detail/fill.hpp | 8 ++-- cpp/src/column/column_factories.cpp | 2 +- cpp/src/filling/fill.cu | 51 +++++++++++++----------- cpp/src/strings/filling/fill.cu | 20 +++++----- 5 files changed, 50 insertions(+), 41 deletions(-) diff --git a/cpp/include/cudf/detail/fill.hpp b/cpp/include/cudf/detail/fill.hpp index 24438c3af06..cfaf323ab12 100644 --- a/cpp/include/cudf/detail/fill.hpp +++ b/cpp/include/cudf/detail/fill.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + #include namespace cudf { @@ -33,7 +35,7 @@ void fill_in_place(mutable_column_view& destination, size_type begin, size_type end, scalar const& value, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @copydoc cudf::fill @@ -45,8 +47,8 @@ std::unique_ptr fill( size_type begin, size_type end, scalar const& value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/fill.hpp b/cpp/include/cudf/strings/detail/fill.hpp index 519880df561..1ddf0ad5cdf 100644 --- a/cpp/include/cudf/strings/detail/fill.hpp +++ b/cpp/include/cudf/strings/detail/fill.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
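The fill conversion follows the same shape, and because the new stream parameter is defaulted to rmm::cuda_stream_default ahead of the defaulted mr, internal callers can pass just a stream. A small sketch under those assumptions; zero_prefix and the INT32 input column are hypothetical, while the detail::fill and scalar constructor argument orders come from this series.

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/fill.hpp>
#include <cudf/scalar/scalar.hpp>
#include <rmm/cuda_stream_view.hpp>

#include <memory>

// Returns a copy of `input` (assumed to be an INT32 column) with rows [0, n) set to zero.
std::unique_ptr<cudf::column> zero_prefix(cudf::column_view const& input,
                                          cudf::size_type n,
                                          rmm::cuda_stream_view stream)
{
  cudf::numeric_scalar<int32_t> zero{0, true, stream};  // scalar ctors also take the stream
  // stream is passed explicitly; mr falls back to its default argument
  return cudf::detail::fill(input, 0, n, zero, stream);
}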
@@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -44,8 +46,8 @@ std::unique_ptr fill( size_type begin, size_type end, string_scalar const& value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace strings diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 72943313dc2..a79277ca21e 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -199,7 +199,7 @@ std::unique_ptr column_from_scalar_dispatch::operator()(null_mask.data()), size}; auto sv = static_cast const&>(value); // fill the column with the scalar - auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, mr, stream.value()); + auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, stream, mr); output->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); // should be no nulls return output; } diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index f801ba0eab4..6fba9bc01a5 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -32,6 +32,9 @@ #include #include +#include +#include + #include namespace { @@ -40,7 +43,7 @@ void in_place_fill(cudf::mutable_column_view& destination, cudf::size_type begin, cudf::size_type end, cudf::scalar const& value, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { using ScalarType = cudf::scalar_type_t; auto p_scalar = static_cast(&value); @@ -61,7 +64,7 @@ struct in_place_fill_range_dispatch { template std::enable_if_t(), void> operator()(cudf::size_type begin, cudf::size_type end, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { in_place_fill(destination, begin, end, value, stream); } @@ -69,7 +72,7 @@ struct in_place_fill_range_dispatch { template std::enable_if_t(), void> operator()(cudf::size_type begin, cudf::size_type end, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { CUDF_FAIL("in-place fill does not work for variable width types."); } @@ -83,8 +86,8 @@ struct out_of_place_fill_range_dispatch { std::unique_ptr operator()( cudf::size_type begin, cudf::size_type end, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch."); auto p_ret = std::make_unique(input, stream, mr); @@ -108,8 +111,8 @@ template <> std::unique_ptr out_of_place_fill_range_dispatch::operator()( cudf::size_type begin, cudf::size_type end, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("list_view not supported yet"); } @@ -118,8 +121,8 @@ template <> std::unique_ptr out_of_place_fill_range_dispatch::operator()( cudf::size_type begin, cudf::size_type end, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("struct_view not supported yet"); } @@ -128,22 +131,22 @@ template <> std::unique_ptr out_of_place_fill_range_dispatch::operator()( cudf::size_type begin, cudf::size_type end, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + 
rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch."); using ScalarType = cudf::scalar_type_t; auto p_scalar = static_cast(&value); return cudf::strings::detail::fill( - cudf::strings_column_view(input), begin, end, *p_scalar, mr, stream); + cudf::strings_column_view(input), begin, end, *p_scalar, stream, mr); } template <> std::unique_ptr out_of_place_fill_range_dispatch::operator()( cudf::size_type begin, cudf::size_type end, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return std::make_unique(input, stream, mr); cudf::dictionary_column_view const target(input); @@ -162,21 +165,21 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()view(), mr, stream); + cudf::dictionary::detail::add_keys(target, scalar_column->view(), mr, stream.value()); cudf::column_view const target_indices = cudf::dictionary_column_view(target_matched->view()).get_indices_annotated(); // get the index of the key just added auto index_of_value = cudf::dictionary::detail::get_index( - target_matched->view(), value, rmm::mr::get_current_device_resource(), stream); + target_matched->view(), value, rmm::mr::get_current_device_resource(), stream.value()); // now call fill using just the indices column and the new index auto new_indices = cudf::type_dispatcher(target_indices.type(), out_of_place_fill_range_dispatch{*index_of_value, target_indices}, begin, end, - mr, - stream); + stream, + mr); auto const indices_type = new_indices->type(); auto const output_size = new_indices->size(); // record these auto const null_count = new_indices->null_count(); // before the release() @@ -206,7 +209,7 @@ void fill_in_place(mutable_column_view& destination, size_type begin, size_type end, scalar const& value, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_EXPECTS(cudf::is_fixed_width(destination.type()) == true, "In-place fill does not support variable-sized types."); @@ -228,13 +231,13 @@ std::unique_ptr fill(column_view const& input, size_type begin, size_type end, scalar const& value, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS((begin >= 0) && (end <= input.size()) && (begin <= end), "Range is out of bounds."); return cudf::type_dispatcher( - input.type(), out_of_place_fill_range_dispatch{value, input}, begin, end, mr, stream); + input.type(), out_of_place_fill_range_dispatch{value, input}, begin, end, stream, mr); } } // namespace detail @@ -245,7 +248,7 @@ void fill_in_place(mutable_column_view& destination, scalar const& value) { CUDF_FUNC_RANGE(); - return detail::fill_in_place(destination, begin, end, value, 0); + return detail::fill_in_place(destination, begin, end, value, rmm::cuda_stream_default); } std::unique_ptr fill(column_view const& input, @@ -255,7 +258,7 @@ std::unique_ptr fill(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::fill(input, begin, end, value, mr, 0); + return detail::fill(input, begin, end, value, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index fa36de38e22..5ed3de2c888 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -34,11 +36,11 @@ std::unique_ptr fill( size_type begin, size_type end, string_scalar const& value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(mr, stream); + if (strings_count == 0) return detail::make_empty_strings_column(mr, stream.value()); CUDF_EXPECTS((begin >= 0) && (end <= strings_count), "Parameters [begin,end) are outside the range of the provided strings column"); CUDF_EXPECTS(begin <= end, "Parameters [begin,end) have invalid range values"); @@ -72,17 +74,17 @@ std::unique_ptr fill( auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto d_offsets = offsets_column->view().data(); // create the chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream); + size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; + auto chars_column = strings::detail::create_chars_child_column( + strings_count, null_count, bytes, mr, stream.value()); // fill the chars column auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, begin, end, d_value, d_offsets, d_chars] __device__(size_type idx) { From 927378b27e6d437bb79c8786fe9c379ca4931806 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 15:35:57 +1100 Subject: [PATCH 20/51] Convert gather to cuda_stream_view --- cpp/include/cudf/detail/copy_if.cuh | 2 +- cpp/include/cudf/detail/gather.cuh | 41 ++++++++++---------- cpp/include/cudf/detail/gather.hpp | 21 +++++++++- cpp/include/cudf/detail/get_value.cuh | 11 ++++-- cpp/include/cudf/detail/indexalator.cuh | 2 +- cpp/include/cudf/detail/scatter.cuh | 2 +- cpp/include/cudf/lists/detail/gather.cuh | 22 ++++++----- cpp/include/cudf/lists/lists_column_view.hpp | 4 +- cpp/include/cudf/strings/detail/gather.cuh | 25 ++++++------ cpp/include/cudf/table/table_device_view.cuh | 14 ++++--- cpp/src/copying/gather.cu | 15 ++++--- cpp/src/copying/sample.cu | 6 +-- cpp/src/dictionary/add_keys.cu | 4 +- cpp/src/dictionary/decode.cu | 4 +- cpp/src/dictionary/remove_keys.cu | 4 +- cpp/src/filling/repeat.cu | 4 +- cpp/src/groupby/groupby.cu | 1 + cpp/src/groupby/hash/groupby.cu | 8 ++-- cpp/src/groupby/sort/group_argmax.cu | 4 +- cpp/src/groupby/sort/group_argmin.cu | 4 +- cpp/src/groupby/sort/group_nth_element.cu | 2 +- cpp/src/groupby/sort/groupby.cu | 8 ++-- cpp/src/groupby/sort/sort_helper.cu | 14 +++---- cpp/src/hash/hashing.cu | 2 +- cpp/src/join/hash_join.cu | 20 +++++----- cpp/src/join/semi_join.cu | 3 +- 
cpp/src/lists/copying/gather.cu | 8 ++-- cpp/src/lists/extract.cu | 4 +- cpp/src/lists/lists_column_view.cu | 4 +- cpp/src/partitioning/partitioning.cu | 2 +- cpp/src/partitioning/round_robin.cu | 10 ++--- cpp/src/quantiles/quantiles.cu | 11 ++++-- cpp/src/reshape/tile.cu | 2 +- cpp/src/rolling/rolling.cu | 4 +- cpp/src/sort/sort.cu | 4 +- cpp/src/stream_compaction/drop_duplicates.cu | 4 +- cpp/src/strings/copying/copying.cu | 4 +- cpp/src/strings/sorting/sorting.cu | 4 +- cpp/src/table/table_device_view.cu | 14 ++++--- cpp/src/transform/encode.cu | 4 +- 40 files changed, 187 insertions(+), 139 deletions(-) diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 05a84a238ff..9daec13d578 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -269,7 +269,7 @@ struct scatter_gather_functor { filter); auto output_table = cudf::detail::gather( - cudf::table_view{{input}}, indices.begin(), indices.end(), false, mr, stream.value()); + cudf::table_view{{input}}, indices.begin(), indices.end(), false, stream, mr); // There will be only one column return std::make_unique(std::move(output_table->get_column(0))); diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index f20af839916..9a115772a0c 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -37,6 +37,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" #include #include @@ -120,11 +121,11 @@ void gather_helper(InputItr source_itr, MapIterator gather_map_begin, MapIterator gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream) + rmm::cuda_stream_view stream) { using map_type = typename std::iterator_traits::value_type; if (nullify_out_of_bounds) { - thrust::gather_if(rmm::exec_policy(stream)->on(stream), + thrust::gather_if(rmm::exec_policy(stream)->on(stream.value()), gather_map_begin, gather_map_end, gather_map_begin, @@ -132,7 +133,7 @@ void gather_helper(InputItr source_itr, target_itr, bounds_checker{0, source_size}); } else { - thrust::gather(rmm::exec_policy(stream)->on(stream), + thrust::gather(rmm::exec_policy(stream)->on(stream.value()), gather_map_begin, gather_map_end, source_itr, @@ -169,7 +170,7 @@ struct column_gatherer_impl { MapIterator gather_map_begin, MapIterator gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const num_rows = cudf::distance(gather_map_begin, gather_map_end); @@ -216,15 +217,15 @@ struct column_gatherer_impl { MapItType gather_map_begin, MapItType gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (true == nullify_out_of_bounds) { return cudf::strings::detail::gather( - strings_column_view(source_column), gather_map_begin, gather_map_end, mr, stream); + strings_column_view(source_column), gather_map_begin, gather_map_end, stream, mr); } else { return cudf::strings::detail::gather( - strings_column_view(source_column), gather_map_begin, gather_map_end, mr, stream); + strings_column_view(source_column), gather_map_begin, gather_map_end, stream, mr); } } }; @@ -289,7 +290,7 @@ struct column_gatherer_impl { MapItRoot gather_map_begin, MapItRoot gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { lists_column_view list(column); @@ -354,7 +355,7 @@ struct column_gatherer { MapIterator 
gather_map_begin, MapIterator gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { column_gatherer_impl gatherer{}; @@ -386,7 +387,7 @@ struct column_gatherer_impl { MapItType gather_map_begin, MapItType gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { dictionary_column_view dictionary(source_column); @@ -457,7 +458,7 @@ void gather_bitmask(table_device_view input, size_type mask_count, size_type mask_size, size_type* valid_counts, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (mask_size == 0) { return; } @@ -469,7 +470,7 @@ void gather_bitmask(table_device_view input, valid_if_n_kernel; cudf::detail::grid_1d grid{mask_size, block_size, 1}; - kernel<<>>( + kernel<<>>( counting_it, counting_it, selector, masks, mask_count, mask_size, valid_counts); } @@ -478,8 +479,8 @@ void gather_bitmask(table_view const& source, MapIterator gather_map, std::vector>& target, gather_bitmask_op op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (target.empty()) { return; } @@ -548,7 +549,7 @@ struct column_gatherer_impl { MapItRoot gather_map_begin, MapItRoot gather_map_end, bool nullify_out_of_bounds, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { structs_column_view structs_column(column); @@ -578,8 +579,8 @@ struct column_gatherer_impl { gather_map_begin, output_struct_members, nullify_out_of_bounds ? gather_bitmask_op::NULLIFY : gather_bitmask_op::DONT_CHECK, - mr, - stream); + stream, + mr); return cudf::make_structs_column( gather_map_size, @@ -620,8 +621,8 @@ std::unique_ptr
gather( MapIterator gather_map_begin, MapIterator gather_map_end, bool nullify_out_of_bounds = false, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { std::vector> destination_columns; @@ -641,7 +642,7 @@ std::unique_ptr
gather( auto const op = nullify_out_of_bounds ? gather_bitmask_op::NULLIFY : gather_bitmask_op::DONT_CHECK; - gather_bitmask(source_table, gather_map_begin, destination_columns, op, mr, stream); + gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr); return std::make_unique
(std::move(destination_columns)); } diff --git a/cpp/include/cudf/detail/gather.hpp b/cpp/include/cudf/detail/gather.hpp index 0f9b01c53d5..adace7e27f8 100644 --- a/cpp/include/cudf/detail/gather.hpp +++ b/cpp/include/cudf/detail/gather.hpp @@ -1,3 +1,18 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #pragma once #include @@ -5,6 +20,8 @@ #include +#include + #include namespace cudf { @@ -48,7 +65,7 @@ std::unique_ptr
gather( column_view const& gather_map, out_of_bounds_policy bounds, negative_index_policy neg_indices, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh index 25ca123fb16..eeff6fb2d9b 100644 --- a/cpp/include/cudf/detail/get_value.cuh +++ b/cpp/include/cudf/detail/get_value.cuh @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { namespace detail { @@ -38,15 +40,18 @@ namespace detail { * @return Value from the `col_view[element_index]` */ template -T get_value(column_view const& col_view, size_type element_index, cudaStream_t stream) +T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stream_view stream) { CUDF_EXPECTS(cudf::is_fixed_width(col_view.type()), "get_value supports only fixed-width types"); CUDF_EXPECTS(data_type(type_to_id()) == col_view.type(), "get_value data type mismatch"); CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(), "invalid element_index value"); T result; - CUDA_TRY(cudaMemcpyAsync( - &result, col_view.data() + element_index, sizeof(T), cudaMemcpyDeviceToHost, stream)); + CUDA_TRY(cudaMemcpyAsync(&result, + col_view.data() + element_index, + sizeof(T), + cudaMemcpyDeviceToHost, + stream.value())); return result; } diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index adee8fd84e0..32ac19518d2 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -229,7 +229,7 @@ struct base_indexalator { * @code * auto begin = indexalator_factory::create_input_iterator(gather_map); * auto end = begin + gather_map.size(); - * auto result = detail::gather( source, begin, end, IGNORE, mr, stream ); + * auto result = detail::gather( source, begin, end, IGNORE, stream, mr ); * @endcode * * @code diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index 0e30ce603cf..6d93c78fd3e 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -283,7 +283,7 @@ std::unique_ptr
scatter( auto gather_map = scatter_to_gather( updated_scatter_map_begin, updated_scatter_map_end, target.num_rows(), stream); - gather_bitmask(source, gather_map.begin(), result, gather_bitmask_op::PASSTHROUGH, mr, stream); + gather_bitmask(source, gather_map.begin(), result, gather_bitmask_op::PASSTHROUGH, stream, mr); return std::make_unique
(std::move(result)); } diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 51291e69b6b..8ea84780fc4 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -19,7 +19,9 @@ #include #include #include + #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace lists { @@ -59,9 +61,9 @@ template gather_data make_gather_data(cudf::lists_column_view const& source_column, MapItType gather_map, size_type gather_map_size, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr, - rmm::device_uvector&& prev_base_offsets) + rmm::device_uvector&& prev_base_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // size of the gather map is the # of output rows size_type output_count = gather_map_size; @@ -79,7 +81,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the compacted outgoing offsets. auto count_iter = thrust::make_counting_iterator(0); thrust::transform_exclusive_scan( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), count_iter, count_iter + offset_count, dst_offsets_v.begin(), @@ -103,7 +105,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the base offsets rmm::device_uvector base_offsets = rmm::device_uvector(output_count, stream); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), gather_map, gather_map + offset_count, base_offsets.data(), @@ -234,16 +236,16 @@ template gather_data make_gather_data(cudf::lists_column_view const& source_column, MapItType gather_map, size_type gather_map_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return make_gather_data( source_column, gather_map, gather_map_size, + rmm::device_uvector{0, stream, mr}, stream, - mr, - rmm::device_uvector{0, stream, mr}); + mr); } /** @@ -262,7 +264,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, std::unique_ptr gather_list_nested( lists_column_view const& list, gather_data& gd, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -281,7 +283,7 @@ std::unique_ptr gather_list_nested( std::unique_ptr gather_list_leaf( column_view const& column, gather_data const& gd, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index cc1463f3f91..d494ee445b3 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -18,6 +18,8 @@ #include #include +#include + /** * @file * @brief Class definition for cudf::lists_column_view @@ -84,7 +86,7 @@ class lists_column_view : private column_view { * * @throw cudf::logic error if this is an empty column */ - column_view get_sliced_child(cudaStream_t stream) const; + column_view get_sliced_child(rmm::cuda_stream_view stream) const; }; /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 25d47288013..8ca70db74a6 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -22,6 
+22,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { @@ -61,12 +62,12 @@ std::unique_ptr gather( strings_column_view const& strings, MapIterator begin, MapIterator end, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto output_count = std::distance(begin, end); auto strings_count = strings.size(); - if (output_count == 0) return make_empty_strings_column(mr, stream); + if (output_count == 0) return make_empty_strings_column(mr, stream.value()); auto execpol = rmm::exec_policy(stream); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -80,13 +81,13 @@ std::unique_ptr gather( }; auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + output_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + output_count, mr, stream.value()); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.template data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[output_count]; - auto chars_column = create_chars_child_column(output_count, 0, bytes, mr, stream); + auto chars_column = create_chars_child_column(output_count, 0, bytes, mr, stream.value()); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.template data(); // fill in chars @@ -102,8 +103,10 @@ std::unique_ptr gather( string_view d_str = d_strings.element(index); memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); }; - thrust::for_each_n( - execpol->on(stream), thrust::make_counting_iterator(0), output_count, gather_chars); + thrust::for_each_n(execpol->on(stream.value()), + thrust::make_counting_iterator(0), + output_count, + gather_chars); return make_strings_column(output_count, std::move(offsets_column), @@ -143,11 +146,11 @@ std::unique_ptr gather( MapIterator begin, MapIterator end, bool nullify_out_of_bounds, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - if (nullify_out_of_bounds) return gather(strings, begin, end, mr, stream); - return gather(strings, begin, end, mr, stream); + if (nullify_out_of_bounds) return gather(strings, begin, end, stream, mr); + return gather(strings, begin, end, stream, mr); } } // namespace detail diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index f60f5f9fe57..8a1938423f0 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -21,6 +21,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" /** * @file table_device_view.cuh @@ -67,10 +68,9 @@ class table_device_view_base { ColumnDeviceView* _columns{}; ///< Array of view objects in device memory size_type _num_rows{}; size_type _num_columns{}; - cudaStream_t _stream{}; protected: - table_device_view_base(HostTableView source_view, cudaStream_t stream); + table_device_view_base(HostTableView source_view, rmm::cuda_stream_view stream); rmm::device_buffer* _descendant_storage{}; }; @@ -78,7 +78,8 @@ class table_device_view_base { class table_device_view : public detail::table_device_view_base { 
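The device-view changes in this patch apply the same idea one level down: views are built on an explicit rmm::cuda_stream_view, and the raw cudaStream_t is recovered with stream.value() only at the CUDA launch boundary. A rough sketch under those assumptions; inspect_table and the launch configuration are made up, while table_device_view::create, stream.value(), and stream.synchronize() are the calls used throughout this series.

#include <cudf/table/table_device_view.cuh>
#include <cudf/table/table_view.hpp>
#include <rmm/cuda_stream_view.hpp>

// Hypothetical no-op kernel; a real kernel would index the columns of d_table here.
__global__ void inspect_table(cudf::table_device_view d_table) {}

void launch_inspect(cudf::table_view const& tv, rmm::cuda_stream_view stream)
{
  // device-view creation is now stream-aware
  auto d_table = cudf::table_device_view::create(tv, stream);
  // unwrap the view only where a raw cudaStream_t is required
  inspect_table<<<1, 256, 0, stream.value()>>>(*d_table);
  stream.synchronize();  // replaces cudaStreamSynchronize(stream)
}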
public: - static auto create(table_view source_view, cudaStream_t stream = 0) + static auto create(table_view source_view, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { auto deleter = [](table_device_view* t) { t->destroy(); }; return std::unique_ptr{ @@ -86,7 +87,7 @@ class table_device_view : public detail::table_device_view_base(source_view, stream) { } @@ -95,7 +96,8 @@ class table_device_view : public detail::table_device_view_base { public: - static auto create(mutable_table_view source_view, cudaStream_t stream = 0) + static auto create(mutable_table_view source_view, + rmm::cuda_stream_view stream = rmm::cuda_stream_default) { auto deleter = [](mutable_table_device_view* t) { t->destroy(); }; return std::unique_ptr{ @@ -103,7 +105,7 @@ class mutable_table_device_view } private: - mutable_table_device_view(mutable_table_view source_view, cudaStream_t stream) + mutable_table_device_view(mutable_table_view source_view, rmm::cuda_stream_view stream) : detail::table_device_view_base(source_view, stream) { diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu index f8e11500603..4e186c00ac3 100644 --- a/cpp/src/copying/gather.cu +++ b/cpp/src/copying/gather.cu @@ -23,6 +23,8 @@ #include #include +#include + #include namespace cudf { @@ -32,8 +34,8 @@ std::unique_ptr
gather(table_view const& source_table, column_view const& gather_map, out_of_bounds_policy bounds, negative_index_policy neg_indices, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(gather_map.has_nulls() == false, "gather_map contains nulls"); @@ -45,7 +47,7 @@ std::unique_ptr
gather(table_view const& source_table, cudf::size_type begin = neg_indices == negative_index_policy::ALLOWED ? -source_table.num_rows() : 0; cudf::size_type end = source_table.num_rows(); - CUDF_EXPECTS(gather_map.size() == thrust::count_if(rmm::exec_policy(stream)->on(stream), + CUDF_EXPECTS(gather_map.size() == thrust::count_if(rmm::exec_policy(stream)->on(stream.value()), map_begin, map_end, [begin, end] __device__(size_type index) { @@ -63,11 +65,11 @@ std::unique_ptr
gather(table_view const& source_table, thrust::make_transform_iterator(map_begin, idx_converter), thrust::make_transform_iterator(map_end, idx_converter), bounds == out_of_bounds_policy::IGNORE, - mr, - stream); + stream, + mr); } return gather( - source_table, map_begin, map_end, bounds == out_of_bounds_policy::IGNORE, mr, stream); + source_table, map_begin, map_end, bounds == out_of_bounds_policy::IGNORE, stream, mr); } } // namespace detail @@ -87,6 +89,7 @@ std::unique_ptr
gather(table_view const& source_table, gather_map, check_bounds ? detail::out_of_bounds_policy::FAIL : detail::out_of_bounds_policy::NULLIFY, index_policy, + rmm::cuda_stream_default, mr); } diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu index e9e2e1d6340..15dc3565da4 100644 --- a/cpp/src/copying/sample.cu +++ b/cpp/src/copying/sample.cu @@ -60,7 +60,7 @@ std::unique_ptr
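
The hunks in this commit converge on one calling convention for the detail layer: the stream parameter moves ahead of the memory resource, and public entry points forward rmm::cuda_stream_default explicitly. A minimal sketch of that convention, using a hypothetical detail::do_work function rather than any real cuDF API:

    #include <cudf/table/table.hpp>
    #include <cudf/table/table_view.hpp>

    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    #include <memory>

    namespace cudf {
    namespace detail {
    // Detail layer: stream comes before mr; both may be defaulted in headers.
    std::unique_ptr<table> do_work(
      table_view const& input,
      rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
    }  // namespace detail

    // Public layer: no stream parameter; the default stream is forwarded explicitly.
    std::unique_ptr<table> do_work(table_view const& input,
                                   rmm::mr::device_memory_resource* mr)
    {
      return detail::do_work(input, rmm::cuda_stream_default, mr);
    }
    }  // namespace cudf
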
sample(table_view const& input, thrust::make_transform_iterator(thrust::counting_iterator(0), RandomGen); auto end = thrust::make_transform_iterator(thrust::counting_iterator(n), RandomGen); - return detail::gather(input, begin, end, false, mr, stream.value()); + return detail::gather(input, begin, end, false, stream, mr); } else { auto gather_map = make_numeric_column( data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED, stream.value()); @@ -78,8 +78,8 @@ std::unique_ptr
sample(table_view const& input, gather_map_view.begin(), gather_map_view.end(), false, - mr, - stream.value()); + stream, + mr); } } diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 5633dcfbc30..c02f38e2a0e 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -92,8 +92,8 @@ std::unique_ptr add_keys( indices_view, cudf::detail::out_of_bounds_policy::IGNORE, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream) + stream, + mr) ->release(); // The output of lower_bound is INT32 but we need to convert to unsigned indices. auto const indices_type = get_indices_type_for_size(keys_column->size()); diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu index c0bde1c92a5..913da30df16 100644 --- a/cpp/src/dictionary/decode.cu +++ b/cpp/src/dictionary/decode.cu @@ -49,8 +49,8 @@ std::unique_ptr decode(dictionary_column_view const& source, indices, cudf::detail::out_of_bounds_policy::IGNORE, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream) + stream, + mr) ->release(); auto output_column = std::unique_ptr(std::move(table_column.front())); diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 3913d68b10f..e04c6257692 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -114,8 +114,8 @@ std::unique_ptr remove_keys_fn( indices_view, cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream) + stream, + mr) ->release(); std::unique_ptr indices_column(std::move(table_indices.front())); diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 8191fd179c7..96e2e15f262 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -131,7 +131,7 @@ std::unique_ptr
repeat(table_view const& input_table, thrust::make_counting_iterator(output_size), indices.begin()); - return gather(input_table, indices.begin(), indices.end(), false, mr, stream); + return gather(input_table, indices.begin(), indices.end(), false, stream, mr); } std::unique_ptr
repeat(table_view const& input_table, @@ -151,7 +151,7 @@ std::unique_ptr
repeat(table_view const& input_table, thrust::make_counting_iterator(0), [count] __device__(auto i) { return i / count; }); auto map_end = map_begin + output_size; - return gather(input_table, map_begin, map_end, false, mr, stream); + return gather(input_table, map_begin, map_end, false, stream, mr); } } // namespace detail diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 90bbf6490ac..4c391852386 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -150,6 +150,7 @@ groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_re helper().key_sort_order(), cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, + rmm::cuda_stream_default, mr); return groupby::groups{ std::move(grouped_keys), std::move(group_offsets_vector), std::move(grouped_values)}; diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 5bc7e0d02f0..14f813ae142 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -150,7 +150,7 @@ void sparse_to_dense_results(std::vector const& requests, [&sparse_results, &gather_map, map_size, i, mr, stream](auto const& agg) { auto s = sparse_results.get_result(i, agg); auto dense_result_table = cudf::detail::gather( - table_view({s}), gather_map.begin(), gather_map.begin() + map_size, false, mr, stream); + table_view({s}), gather_map.begin(), gather_map.begin() + map_size, false, stream, mr); return std::move(dense_result_table->release()[0]); }; @@ -173,8 +173,8 @@ void sparse_to_dense_results(std::vector const& requests, arg_result->nullable() ? cudf::detail::out_of_bounds_policy::IGNORE : cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); return std::move(transformed_result->release()[0]); }; @@ -396,7 +396,7 @@ std::unique_ptr
groupby_null_templated(table_view const& keys, sparse_to_dense_results(requests, sparse_results, cache, gather_map, map_size, stream, mr); return cudf::detail::gather( - keys, gather_map.begin(), gather_map.begin() + map_size, false, mr, stream); + keys, gather_map.begin(), gather_map.begin() + map_size, false, stream, mr); } } // namespace diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu index 3d9d490e669..b49fbeb7387 100644 --- a/cpp/src/groupby/sort/group_argmax.cu +++ b/cpp/src/groupby/sort/group_argmax.cu @@ -54,8 +54,8 @@ std::unique_ptr group_argmax(column_view const& values, indices->nullable() ? cudf::detail::out_of_bounds_policy::IGNORE : cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); return std::move(result_table->release()[0]); } diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu index 1beaab58fe3..5ae11ba0506 100644 --- a/cpp/src/groupby/sort/group_argmin.cu +++ b/cpp/src/groupby/sort/group_argmin.cu @@ -54,8 +54,8 @@ std::unique_ptr group_argmin(column_view const& values, indices->nullable() ? cudf::detail::out_of_bounds_policy::IGNORE : cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); return std::move(result_table->release()[0]); } diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu index e33ec34b92e..bc9d0016207 100644 --- a/cpp/src/groupby/sort/group_nth_element.cu +++ b/cpp/src/groupby/sort/group_nth_element.cu @@ -104,7 +104,7 @@ std::unique_ptr group_nth_element(column_view const &values, }); } auto output_table = cudf::detail::gather( - table_view{{values}}, nth_index.begin(), nth_index.end(), true, mr, stream); + table_view{{values}}, nth_index.begin(), nth_index.end(), true, stream, mr); if (!output_table->get_column(0).has_nulls()) output_table->get_column(0).set_null_mask({}, 0); return std::make_unique(std::move(output_table->get_column(0))); } diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index c9038082d88..7077e6f089c 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -204,8 +204,8 @@ void store_result_functor::operator()(aggregation const& agg) argmin_result.nullable() ? cudf::detail::out_of_bounds_policy::IGNORE : cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); return std::move(transformed_result->release()[0]); } }(); @@ -241,8 +241,8 @@ void store_result_functor::operator()(aggregation const& agg) argmax_result.nullable() ? 
cudf::detail::out_of_bounds_policy::IGNORE : cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); return std::move(transformed_result->release()[0]); } }(); diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index e63a3a61015..064c3e97b20 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -279,8 +279,8 @@ sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values( gather_map, cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream.value()); + stream, + mr); return std::move(sorted_values_table->release()[0]); } @@ -294,8 +294,8 @@ sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values( gather_map, cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream.value()); + stream, + mr); return std::move(grouped_values_table->release()[0]); } @@ -309,7 +309,7 @@ std::unique_ptr
sort_groupby_helper::unique_keys(rmm::cuda_stream_view st group_offsets().begin(), [idx_data] __device__(size_type i) { return idx_data[i]; }); return cudf::detail::gather( - _keys, gather_map_it, gather_map_it + num_groups(), false, mr, stream.value()); + _keys, gather_map_it, gather_map_it + num_groups(), false, stream, mr); } std::unique_ptr
sort_groupby_helper::sorted_keys(rmm::cuda_stream_view stream, @@ -319,8 +319,8 @@ std::unique_ptr
sort_groupby_helper::sorted_keys(rmm::cuda_stream_view st key_sort_order(), cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream.value()); + stream, + mr); } } // namespace sort diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 2066b889dd4..03b6248f35a 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -583,7 +583,7 @@ std::pair, std::vector> hash_partition_table( // Handle bitmask using gather to take advantage of ballot_sync detail::gather_bitmask( - input, gather_map.begin(), output_cols, detail::gather_bitmask_op::DONT_CHECK, mr, stream); + input, gather_map.begin(), output_cols, detail::gather_bitmask_op::DONT_CHECK, stream, mr); } auto output{std::make_unique
(std::move(output_cols))}; diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 456e26a7cae..91188539790 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -416,14 +416,14 @@ std::pair, std::unique_ptr
> construct_join_output_ complement_indices.second.begin(), complement_indices.second.end(), nullify_out_of_bounds, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); auto common_from_probe = detail::gather(probe.select(probe_common_col), joined_indices.first.begin(), joined_indices.first.end(), nullify_out_of_bounds, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); common_table = cudf::detail::concatenate( {common_from_build->view(), common_from_probe->view()}, stream, mr); } @@ -434,8 +434,8 @@ std::pair, std::unique_ptr
> construct_join_output_ joined_indices.first.begin(), joined_indices.first.end(), nullify_out_of_bounds, - mr, - stream); + stream, + mr); } } @@ -444,15 +444,15 @@ std::pair, std::unique_ptr
> construct_join_output_ joined_indices.first.begin(), joined_indices.first.end(), nullify_out_of_bounds, - mr, - stream); + stream, + mr); std::unique_ptr
build_table = detail::gather(build.select(build_noncommon_col), joined_indices.second.begin(), joined_indices.second.end(), nullify_out_of_bounds, - mr, - stream); + stream, + mr); return combine_join_columns(probe_table->release(), probe_noncommon_col, diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 318bd30fe0f..9d6dd55ec03 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -24,6 +24,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -155,7 +156,7 @@ std::unique_ptr left_semi_anti_join( // rebuild left table for call to gather auto const left_updated = scatter_columns(left_selected, left_on, left); return cudf::detail::gather( - left_updated.select(return_columns), gather_map.begin(), gather_map_end, false, mr); + left_updated.select(return_columns), gather_map.begin(), gather_map_end, false, stream, mr); } } // namespace detail diff --git a/cpp/src/lists/copying/gather.cu b/cpp/src/lists/copying/gather.cu index 1f6155b3167..96c20fd93ad 100644 --- a/cpp/src/lists/copying/gather.cu +++ b/cpp/src/lists/copying/gather.cu @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace lists { namespace detail { @@ -86,7 +88,7 @@ struct list_gatherer { */ std::unique_ptr gather_list_leaf(column_view const& column, gather_data const& gd, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // gather map iterator for this level (N) @@ -135,7 +137,7 @@ std::unique_ptr gather_list_leaf(column_view const& column, */ std::unique_ptr gather_list_nested(cudf::lists_column_view const& list, gather_data& gd, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // gather map iterator for this level (N) @@ -164,7 +166,7 @@ std::unique_ptr gather_list_nested(cudf::lists_column_view const& list, // generate gather_data for next level (N+1), potentially recycling the temporary // base_offsets buffer. gather_data child_gd = make_gather_data( - list, gather_map_begin, gather_map_size, stream, mr, std::move(gd.base_offsets)); + list, gather_map_begin, gather_map_size, std::move(gd.base_offsets), stream, mr); // the nesting case. 
if (list.child().type() == cudf::data_type{type_id::LIST}) { diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index e1fbc74b818..5adb21a47f1 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -97,8 +97,8 @@ std::unique_ptr extract_list_element(lists_column_view lists_column, d_gather_map, d_gather_map + gather_map->size(), true, // nullify-out-of-bounds - mr, - stream) + stream, + mr) ->release(); if (result.front()->null_count() == 0) result.front()->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); diff --git a/cpp/src/lists/lists_column_view.cu b/cpp/src/lists/lists_column_view.cu index e4d52d74a13..5c717487951 100644 --- a/cpp/src/lists/lists_column_view.cu +++ b/cpp/src/lists/lists_column_view.cu @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { lists_column_view::lists_column_view(column_view const& lists_column) : column_view(lists_column) @@ -41,7 +43,7 @@ column_view lists_column_view::child() const return column_view::child(child_column_index); } -column_view lists_column_view::get_sliced_child(cudaStream_t stream) const +column_view lists_column_view::get_sliced_child(rmm::cuda_stream_view stream) const { // if I have a positive offset, I need to slice my child if (offset() > 0) { diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index b18c231b309..c63b7079a07 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -596,7 +596,7 @@ std::pair, std::vector> hash_partition_table( // Handle bitmask using gather to take advantage of ballot_sync detail::gather_bitmask( - input, gather_map.begin(), output_cols, detail::gather_bitmask_op::DONT_CHECK, mr, stream); + input, gather_map.begin(), output_cols, detail::gather_bitmask_op::DONT_CHECK, stream, mr); } auto output{std::make_unique
(std::move(output_cols))}; diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index 4d9d67e8dd8..aadcaa6d51f 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -99,8 +99,8 @@ std::pair, std::vector> degenerate rotated_iter_begin, rotated_iter_begin + nrows, // map false, - mr, - stream); + stream, + mr); auto ret_pair = std::make_pair(std::move(uniq_tbl), std::vector(num_partitions)); @@ -134,8 +134,8 @@ std::pair, std::vector> degenerate d_row_indices.begin(), d_row_indices.end(), // map false, - mr, - stream); + stream, + mr); auto ret_pair = std::make_pair(std::move(uniq_tbl), std::vector(num_partitions)); @@ -251,7 +251,7 @@ std::pair, std::vector> round_robin_part return num_partitions * index_within_partition + partition_index; }); - auto uniq_tbl = cudf::detail::gather(input, iter_begin, iter_begin + nrows, false, mr, stream); + auto uniq_tbl = cudf::detail::gather(input, iter_begin, iter_begin + nrows, false, stream, mr); auto ret_pair = std::make_pair(std::move(uniq_tbl), std::vector(num_partitions)); // this has the effect of rotating the set of partition sizes diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index 96377223a19..51e71104cac 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -23,6 +23,8 @@ #include #include +#include + #include #include @@ -33,6 +35,7 @@ std::unique_ptr
quantiles(table_view const& input, SortMapIterator sortmap, std::vector const& q, interpolation interp, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto quantile_idx_lookup = [sortmap, interp, size = input.num_rows()] __device__(double q) { @@ -44,7 +47,7 @@ std::unique_ptr
quantiles(table_view const& input, auto quantile_idx_iter = thrust::make_transform_iterator(q_device.begin(), quantile_idx_lookup); - return detail::gather(input, quantile_idx_iter, quantile_idx_iter + q.size(), false, mr); + return detail::gather(input, quantile_idx_iter, quantile_idx_iter + q.size(), false, stream, mr); } } // namespace detail @@ -67,10 +70,12 @@ std::unique_ptr
quantiles(table_view const& input, CUDF_EXPECTS(input.num_rows() > 0, "multi-column quantiles require at least one input row."); if (is_input_sorted == sorted::YES) { - return detail::quantiles(input, thrust::make_counting_iterator(0), q, interp, mr); + return detail::quantiles( + input, thrust::make_counting_iterator(0), q, interp, rmm::cuda_stream_default, mr); } else { auto sorted_idx = detail::sorted_order(input, column_order, null_precedence); - return detail::quantiles(input, sorted_idx->view().data(), q, interp, mr); + return detail::quantiles( + input, sorted_idx->view().data(), q, interp, rmm::cuda_stream_default, mr); } } diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu index 73b20f5ba3f..2803ee1bab3 100644 --- a/cpp/src/reshape/tile.cu +++ b/cpp/src/reshape/tile.cu @@ -52,7 +52,7 @@ std::unique_ptr
tile(const table_view &in, auto counting_it = thrust::make_counting_iterator(0); auto tiled_it = thrust::make_transform_iterator(counting_it, tile_functor{in_num_rows}); - return detail::gather(in, tiled_it, tiled_it + out_num_rows, false, mr, stream); + return detail::gather(in, tiled_it, tiled_it + out_num_rows, false, stream, mr); } } // namespace detail diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index b0d63c19e9a..a31eabe3964 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -701,8 +701,8 @@ struct rolling_window_launcher { output->view(), detail::out_of_bounds_policy::IGNORE, detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); return std::make_unique(std::move(output_table->get_column(0))); } diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index ce92aad9859..d163c4e5be8 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -49,8 +49,8 @@ std::unique_ptr
sort_by_key(table_view const& values, sorted_order->view(), detail::out_of_bounds_policy::NULLIFY, detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); } } // namespace detail diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index e9206f60c8b..970ce7eb198 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -213,8 +213,8 @@ std::unique_ptr
drop_duplicates(table_view const& input, unique_indices_view, detail::out_of_bounds_policy::NULLIFY, detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); } /** diff --git a/cpp/src/strings/copying/copying.cu b/cpp/src/strings/copying/copying.cu index f2cf4e29e50..4c99b45f5ce 100644 --- a/cpp/src/strings/copying/copying.cu +++ b/cpp/src/strings/copying/copying.cu @@ -55,8 +55,8 @@ std::unique_ptr copy_slice(strings_column_view const& strings, indices_view, cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream) + stream, + mr) ->release(); std::unique_ptr output_column(std::move(sliced_table.front())); if (output_column->null_count() == 0) diff --git a/cpp/src/strings/sorting/sorting.cu b/cpp/src/strings/sorting/sorting.cu index 64b78475541..0a5a2238d9b 100644 --- a/cpp/src/strings/sorting/sorting.cu +++ b/cpp/src/strings/sorting/sorting.cu @@ -67,8 +67,8 @@ std::unique_ptr sort(strings_column_view strings, indices_view, cudf::detail::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream) + stream, + mr) ->release(); return std::move(table_sorted.front()); } diff --git a/cpp/src/table/table_device_view.cu b/cpp/src/table/table_device_view.cu index 5ddab572225..a2cb69044ed 100644 --- a/cpp/src/table/table_device_view.cu +++ b/cpp/src/table/table_device_view.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include +#include + #include #include #include @@ -34,8 +36,8 @@ void table_device_view_base::destroy() template table_device_view_base::table_device_view_base( - HostTableView source_view, cudaStream_t stream) - : _num_rows{source_view.num_rows()}, _num_columns{source_view.num_columns()}, _stream{stream} + HostTableView source_view, rmm::cuda_stream_view stream) + : _num_rows{source_view.num_rows()}, _num_columns{source_view.num_columns()} { // The table's columns must be converted to ColumnDeviceView // objects and copied into device memory for the table_device_view's @@ -82,9 +84,9 @@ table_device_view_base::table_device_view_base( d_end += col_child_data_size; } - CUDA_TRY( - cudaMemcpyAsync(_columns, h_buffer.data(), views_size_bytes, cudaMemcpyDefault, stream)); - CUDA_TRY(cudaStreamSynchronize(stream)); + CUDA_TRY(cudaMemcpyAsync( + _columns, h_buffer.data(), views_size_bytes, cudaMemcpyDefault, stream.value())); + stream.synchronize(); } } diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index c8e7bd2fd5e..57475e0f59e 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -78,8 +78,8 @@ std::pair, std::unique_ptr> encode( gather_map_column, cudf::detail::out_of_bounds_policy::FAIL, cudf::detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + stream, + mr); } auto indices_column = From 243d2a153a56c80e63c3c2c875e9fe79b7421a4c Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 15:37:28 +1100 Subject: [PATCH 21/51] rename type_conversion .cu->.cpp --- cpp/src/io/utilities/{type_conversion.cu => type_conversion.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cpp/src/io/utilities/{type_conversion.cu => type_conversion.cpp} (100%) diff --git a/cpp/src/io/utilities/type_conversion.cu b/cpp/src/io/utilities/type_conversion.cpp similarity index 100% rename from 
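
Where a raw cudaStream_t is still required (cudaMemcpyAsync, the ->on(...) form of the Thrust execution policy), the converted code unwraps the view with stream.value() and synchronizes through the view itself. A small sketch of that pattern, with made-up buffer names:

    #include <rmm/cuda_stream_view.hpp>

    #include <cuda_runtime.h>

    #include <cstddef>

    // Copy host data to device on a caller-provided stream, then block until done.
    void copy_to_device(void* d_dst,
                        void const* h_src,
                        std::size_t bytes,
                        rmm::cuda_stream_view stream)
    {
      // Raw CUDA APIs take the underlying cudaStream_t via value().
      cudaMemcpyAsync(d_dst, h_src, bytes, cudaMemcpyHostToDevice, stream.value());
      // Equivalent to cudaStreamSynchronize(stream.value()).
      stream.synchronize();
    }
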
cpp/src/io/utilities/type_conversion.cu rename to cpp/src/io/utilities/type_conversion.cpp From 554a370eee4651bf9abd218c70669fdb4834f535 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 15:39:17 +1100 Subject: [PATCH 22/51] Rename structs_column_view .cu->.cpp --- .../structs/{structs_column_view.cu => structs_column_view.cpp} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename cpp/src/structs/{structs_column_view.cu => structs_column_view.cpp} (97%) diff --git a/cpp/src/structs/structs_column_view.cu b/cpp/src/structs/structs_column_view.cpp similarity index 97% rename from cpp/src/structs/structs_column_view.cu rename to cpp/src/structs/structs_column_view.cpp index f9cb345de6f..ff7f6516cef 100644 --- a/cpp/src/structs/structs_column_view.cu +++ b/cpp/src/structs/structs_column_view.cpp @@ -25,4 +25,4 @@ structs_column_view::structs_column_view(column_view const& rhs) : column_view{r CUDF_EXPECTS(type().id() == type_id::STRUCT, "structs_column_view only supports struct columns"); } -} // namespace cudf \ No newline at end of file +} // namespace cudf From a76e9ec270228248537219e2fbbbfc8d947b1965 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Fri, 6 Nov 2020 15:44:20 +1100 Subject: [PATCH 23/51] Convert hash groupby to cuda_stream_view --- cpp/include/cudf/detail/groupby.hpp | 3 ++- cpp/src/groupby/hash/groupby.cu | 27 ++++++++++++++------------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/cpp/include/cudf/detail/groupby.hpp b/cpp/include/cudf/detail/groupby.hpp index 3091b5c121e..c616a2c8d50 100644 --- a/cpp/include/cudf/detail/groupby.hpp +++ b/cpp/include/cudf/detail/groupby.hpp @@ -19,6 +19,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace groupby { @@ -41,7 +42,7 @@ std::pair, std::vector> groupby( table_view const& keys, std::vector const& requests, null_policy include_null_keys, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace hash diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 14f813ae142..e0c9d92fd30 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-20, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
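
By contrast, RMM's own containers accept the stream view directly, which is why the converted code passes stream rather than stream.value() to rmm::device_buffer and rmm::device_uvector. A minimal illustration (the sizes and names here are arbitrary):

    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/device_buffer.hpp>
    #include <rmm/device_uvector.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    #include <cstdint>

    void make_temporaries(rmm::cuda_stream_view stream)
    {
      auto* mr = rmm::mr::get_current_device_resource();

      // Stream-ordered, untyped allocation of 1024 bytes.
      rmm::device_buffer scratch{1024, stream, mr};

      // Typed, uninitialized storage for 256 int32_t elements.
      rmm::device_uvector<int32_t> offsets{256, stream, mr};
    }
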
@@ -40,6 +40,8 @@ #include #include +#include + #include #include @@ -137,7 +139,7 @@ void sparse_to_dense_results(std::vector const& requests, cudf::detail::result_cache* dense_results, rmm::device_vector const& gather_map, size_type map_size, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { for (size_t i = 0; i < requests.size(); i++) { @@ -203,7 +205,7 @@ void sparse_to_dense_results(std::vector const& requests, template auto create_hash_map(table_device_view const& d_keys, null_policy include_null_keys, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { size_type constexpr unused_key{std::numeric_limits::max()}; size_type constexpr unused_value{std::numeric_limits::max()}; @@ -226,7 +228,7 @@ auto create_hash_map(table_device_view const& d_keys, hasher, rows_equal, allocator_type(), - stream); + stream.value()); } /** @@ -241,7 +243,7 @@ void compute_single_pass_aggs(table_view const& keys, cudf::detail::result_cache* sparse_results, Map& map, null_policy include_null_keys, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // flatten the aggs to a table that can be operated on by aggregate_row table_view flattened_values; @@ -281,7 +283,7 @@ void compute_single_pass_aggs(table_view const& keys, auto row_bitmask{ cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource())}; thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), keys.num_rows(), hash::compute_single_pass_aggs{map, @@ -292,7 +294,7 @@ void compute_single_pass_aggs(table_view const& keys, static_cast(row_bitmask.data())}); } else { thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), keys.num_rows(), hash::compute_single_pass_aggs{ @@ -313,9 +315,8 @@ void compute_single_pass_aggs(table_view const& keys, * `map`. */ template -std::pair, size_type> extract_populated_keys(Map map, - size_type num_keys, - cudaStream_t stream = 0) +std::pair, size_type> extract_populated_keys( + Map map, size_type num_keys, rmm::cuda_stream_view stream) { rmm::device_vector populated_keys(num_keys); @@ -326,7 +327,7 @@ std::pair, size_type> extract_populated_keys(Map m }; auto end_it = thrust::copy_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_transform_iterator(map.data(), get_key), thrust::make_transform_iterator(map.data() + map.capacity(), get_key), populated_keys.begin(), @@ -369,7 +370,7 @@ std::unique_ptr
groupby_null_templated(table_view const& keys, std::vector const& requests, cudf::detail::result_cache* cache, null_policy include_null_keys, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto d_keys = table_device_view::create(keys); @@ -425,7 +426,7 @@ std::pair, std::vector> groupby( table_view const& keys, std::vector const& requests, null_policy include_null_keys, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { cudf::detail::result_cache cache(requests.size()); From 817c7156b05a429f766011c9581339d0dc67bfc3 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 08:30:11 +1100 Subject: [PATCH 24/51] Convert hashing to cuda_stream_view --- cpp/include/cudf/detail/hashing.hpp | 18 +++-- cpp/src/hash/hashing.cu | 119 +++++++++++++++------------- 2 files changed, 73 insertions(+), 64 deletions(-) diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index c5600f0af18..445affb37f7 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -17,6 +17,8 @@ #include +#include + namespace cudf { namespace detail { /** @@ -28,8 +30,8 @@ std::pair, std::vector> hash_partition( table_view const& input, std::vector const& columns_to_hash, int num_partitions, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::hash @@ -40,19 +42,19 @@ std::unique_ptr hash( table_view const& input, hash_id hash_function = hash_id::HASH_MURMUR3, std::vector const& initial_hash = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr murmur_hash3_32( table_view const& input, std::vector const& initial_hash = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr md5_hash( table_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 03b6248f35a..ab703c78261 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -27,6 +27,8 @@ #include #include +#include + namespace cudf { namespace { // Launch configuration for optimized hash partition @@ -330,7 +332,7 @@ void copy_block_partitions_impl(InputIter const input, size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // We need 3 chunks of shared memory: // 1. 
BLOCK_SIZE * ROWS_PER_THREAD elements of size_type for copying to output @@ -339,7 +341,7 @@ void copy_block_partitions_impl(InputIter const input, int const smem = OPTIMIZED_BLOCK_SIZE * OPTIMIZED_ROWS_PER_THREAD * sizeof(*output) + (num_partitions + 1) * sizeof(size_type) * 2; - copy_block_partitions<<>>( + copy_block_partitions<<>>( input, output, num_rows, @@ -357,7 +359,7 @@ rmm::device_vector compute_gather_map(size_type num_rows, size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto sequence = thrust::make_counting_iterator(0); rmm::device_vector gather_map(num_rows); @@ -385,8 +387,8 @@ struct copy_block_partitions_dispatcher { size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_buffer output(input.size() * sizeof(DataType), stream, mr); @@ -412,8 +414,8 @@ struct copy_block_partitions_dispatcher { size_type const* block_partition_sizes, size_type const* scanned_block_partition_sizes, size_type grid_size, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Use move_to_output_buffer to create an equivalent gather map auto gather_map = compute_gather_map(input.size(), @@ -443,8 +445,8 @@ std::pair, std::vector> hash_partition_table( table_view const& input, table_view const& table_to_hash, size_type num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const num_rows = table_to_hash.num_rows(); @@ -490,14 +492,14 @@ std::pair, std::vector> hash_partition_table( compute_row_partition_numbers<<>>(hasher, - num_rows, - num_partitions, - partitioner_type(num_partitions), - row_partition_numbers.data().get(), - row_partition_offset.data().get(), - block_partition_sizes.data().get(), - global_partition_sizes.data().get()); + stream.value()>>>(hasher, + num_rows, + num_partitions, + partitioner_type(num_partitions), + row_partition_numbers.data().get(), + row_partition_offset.data().get(), + block_partition_sizes.data().get(), + global_partition_sizes.data().get()); } else { // Determines how the mapping between hash value and partition number is computed using partitioner_type = modulo_partitioner; @@ -508,19 +510,19 @@ std::pair, std::vector> hash_partition_table( compute_row_partition_numbers<<>>(hasher, - num_rows, - num_partitions, - partitioner_type(num_partitions), - row_partition_numbers.data().get(), - row_partition_offset.data().get(), - block_partition_sizes.data().get(), - global_partition_sizes.data().get()); + stream.value()>>>(hasher, + num_rows, + num_partitions, + partitioner_type(num_partitions), + row_partition_numbers.data().get(), + row_partition_offset.data().get(), + block_partition_sizes.data().get(), + global_partition_sizes.data().get()); } // Compute exclusive scan of all blocks' partition sizes in-place to determine // the starting point for each blocks portion of each partition in the output - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), block_partition_sizes.begin(), block_partition_sizes.end(), scanned_block_partition_sizes.data().get()); @@ -529,7 +531,7 @@ std::pair, 
std::vector> hash_partition_table( // of each partition in final output. // TODO This can be done independently on a separate stream size_type* scanned_global_partition_sizes{global_partition_sizes.data().get()}; - thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream.value()), global_partition_sizes.begin(), global_partition_sizes.end(), scanned_global_partition_sizes); @@ -541,7 +543,7 @@ std::pair, std::vector> hash_partition_table( scanned_global_partition_sizes, num_partitions * sizeof(size_type), cudaMemcpyDeviceToHost, - stream)); + stream.value())); // When the number of partitions is less than a threshold, we can apply an // optimization using shared memory to copy values to the output buffer. @@ -566,8 +568,8 @@ std::pair, std::vector> hash_partition_table( block_partition_sizes_ptr, scanned_block_partition_sizes_ptr, grid_size, - mr, - stream); + stream, + mr); }); if (has_nulls(input)) { @@ -596,12 +598,17 @@ std::pair, std::vector> hash_partition_table( compute_row_output_locations<<>>( + stream.value()>>>( row_output_locations, num_rows, num_partitions, scanned_block_partition_sizes_ptr); // Use the resulting scatter map to materialize the output - auto output = detail::scatter( - input, row_partition_numbers.begin(), row_partition_numbers.end(), input, false, mr, stream); + auto output = detail::scatter(input, + row_partition_numbers.begin(), + row_partition_numbers.end(), + input, + false, + mr, + stream.value()); return std::make_pair(std::move(output), std::move(partition_offsets)); } @@ -620,8 +627,8 @@ std::pair, std::vector> hash_partition( table_view const& input, std::vector const& columns_to_hash, int num_partitions, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -633,28 +640,28 @@ std::pair, std::vector> hash_partition( } if (has_nulls(table_to_hash)) { - return hash_partition_table(input, table_to_hash, num_partitions, mr, stream); + return hash_partition_table(input, table_to_hash, num_partitions, stream, mr); } else { - return hash_partition_table(input, table_to_hash, num_partitions, mr, stream); + return hash_partition_table(input, table_to_hash, num_partitions, stream, mr); } } std::unique_ptr hash(table_view const& input, hash_id hash_function, std::vector const& initial_hash, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { switch (hash_function) { - case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, initial_hash, mr, stream); - case (hash_id::HASH_MD5): return md5_hash(input, mr, stream); + case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, initial_hash, stream, mr); + case (hash_id::HASH_MD5): return md5_hash(input, stream, mr); default: return nullptr; } } std::unique_ptr md5_hash(table_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.num_columns() == 0 || input.num_rows() == 0) { const string_scalar string_128bit("d41d8cd98f00b204e9orig98ecf8427e"); @@ -673,14 +680,14 @@ std::unique_ptr md5_hash(table_view const& input, "MD5 unsupported column type"); // Result column allocation and creation - auto begin = thrust::make_constant_iterator(32); - auto offsets_column = - cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), mr, stream); + 
auto begin = thrust::make_constant_iterator(32); + auto offsets_column = cudf::strings::detail::make_offsets_child_column( + begin, begin + input.num_rows(), mr, stream.value()); auto offsets_view = offsets_column->view(); auto d_new_offsets = offsets_view.data(); auto chars_column = strings::detail::create_chars_child_column( - input.num_rows(), 0, input.num_rows() * 32, mr, stream); + input.num_rows(), 0, input.num_rows() * 32, mr, stream.value()); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); @@ -689,7 +696,7 @@ std::unique_ptr md5_hash(table_view const& input, auto const device_input = table_device_view::create(input, stream); // Hash each row, hashing each element sequentially left to right - thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.num_rows()), [d_chars, device_input = *device_input] __device__(auto row_index) { @@ -718,8 +725,8 @@ std::unique_ptr md5_hash(table_view const& input, std::unique_ptr murmur_hash3_32(table_view const& input, std::vector const& initial_hash, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // TODO this should be UINT32 auto output = make_numeric_column( @@ -739,13 +746,13 @@ std::unique_ptr murmur_hash3_32(table_view const& input, auto device_initial_hash = rmm::device_vector(initial_hash); if (nullable) { - thrust::tabulate(rmm::exec_policy(stream)->on(stream), + thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()), output_view.begin(), output_view.end(), row_hasher_initial_values( *device_input, device_initial_hash.data().get())); } else { - thrust::tabulate(rmm::exec_policy(stream)->on(stream), + thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()), output_view.begin(), output_view.end(), row_hasher_initial_values( @@ -753,12 +760,12 @@ std::unique_ptr murmur_hash3_32(table_view const& input, } } else { if (nullable) { - thrust::tabulate(rmm::exec_policy(stream)->on(stream), + thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()), output_view.begin(), output_view.end(), row_hasher(*device_input)); } else { - thrust::tabulate(rmm::exec_policy(stream)->on(stream), + thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()), output_view.begin(), output_view.end(), row_hasher(*device_input)); @@ -776,7 +783,7 @@ std::unique_ptr hash(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hash(input, hash_function, initial_hash, mr); + return detail::hash(input, hash_function, initial_hash, rmm::cuda_stream_default, mr); } std::unique_ptr murmur_hash3_32(table_view const& input, @@ -784,7 +791,7 @@ std::unique_ptr murmur_hash3_32(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::murmur_hash3_32(input, initial_hash, mr); + return detail::murmur_hash3_32(input, initial_hash, rmm::cuda_stream_default, mr); } } // namespace cudf From 2da7bb140261672aa751862c1a53d86f754d4fad Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 11:07:56 +1100 Subject: [PATCH 25/51] Add conda_include_dirs to benchmarks cmakelists --- cpp/benchmarks/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 667498fa965..893ca87e169 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ 
b/cpp/benchmarks/CMakeLists.txt @@ -58,6 +58,11 @@ include_directories("${CMAKE_BINARY_DIR}/include" "${RMM_INCLUDE}" "${CMAKE_CURRENT_SOURCE_DIR}") +if(CONDA_INCLUDE_DIRS) + include_directories("${CONDA_INCLUDE_DIRS}") +endif(CONDA_INCLUDE_DIRS) + + ################################################################################################### # - library paths --------------------------------------------------------------------------------- From be38bdaf1c6ddbe46ce92cd028b8cd9e457c85fc Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 11:39:55 +1100 Subject: [PATCH 26/51] Convert interop to rmm::cuda_stream_view --- cpp/include/cudf/detail/interop.hpp | 18 +++--- cpp/src/interop/dlpack.cpp | 17 ++--- cpp/src/interop/from_arrow.cpp | 99 +++++++++++++++-------------- cpp/src/interop/to_arrow.cpp | 35 +++++----- 4 files changed, 88 insertions(+), 81 deletions(-) diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 8271a04265d..c6d2014f80e 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -21,6 +21,8 @@ #include #include +#include + namespace cudf { namespace detail { @@ -31,8 +33,8 @@ namespace detail { */ std::unique_ptr
from_dlpack( DLManagedTensor const* managed_tensor, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::to_dlpack @@ -41,8 +43,8 @@ std::unique_ptr
from_dlpack( */ DLManagedTensor* to_dlpack( table_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); // Creating arrow as per given type_id and buffer arguments template @@ -101,8 +103,8 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); **/ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, - arrow::MemoryPool* ar_mr = arrow::default_memory_pool(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @copydoc cudf::arrow_to_cudf @@ -111,8 +113,8 @@ std::shared_ptr to_arrow(table_view input, **/ std::unique_ptr
from_arrow( arrow::Table const& input_table, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index 714e95f28b8..bb79a1d437e 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -21,6 +21,7 @@ #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace { @@ -113,8 +114,8 @@ struct dltensor_context { namespace detail { std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(nullptr != managed_tensor, "managed_tensor is null"); auto const& tensor = managed_tensor->dl_tensor; @@ -171,7 +172,7 @@ std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, reinterpret_cast(tensor_data), bytes, cudaMemcpyDefault, - stream)); + stream.value())); tensor_data += col_stride; } @@ -180,8 +181,8 @@ std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, } DLManagedTensor* to_dlpack(table_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const num_rows = input.num_rows(); auto const num_cols = input.num_columns(); @@ -241,7 +242,7 @@ DLManagedTensor* to_dlpack(table_view const& input, get_column_data(col), stride_bytes, cudaMemcpyDefault, - stream)); + stream.value())); tensor_data += stride_bytes; } @@ -256,12 +257,12 @@ DLManagedTensor* to_dlpack(table_view const& input, std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, rmm::mr::device_memory_resource* mr) { - return detail::from_dlpack(managed_tensor, mr); + return detail::from_dlpack(managed_tensor, rmm::cuda_stream_default, mr); } DLManagedTensor* to_dlpack(table_view const& input, rmm::mr::device_memory_resource* mr) { - return detail::to_dlpack(input, mr); + return detail::to_dlpack(input, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 045c1174b08..690647d9306 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -92,8 +92,8 @@ struct dispatch_to_cudf_column { * @brief Returns mask from an array withut any offsets. */ std::unique_ptr get_mask_buffer(arrow::Array const& array, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (array.null_bitmap_data() == nullptr) { return std::make_unique(0, stream, mr); @@ -107,7 +107,7 @@ struct dispatch_to_cudf_column { reinterpret_cast(mask_buffer->address()), array.null_bitmap()->size(), cudaMemcpyDefault, - stream)); + stream.value())); return mask; } @@ -115,8 +115,8 @@ struct dispatch_to_cudf_column { std::unique_ptr operator()(arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto data_buffer = array.data()->buffers[1]; size_type const num_rows = array.length(); @@ -128,9 +128,9 @@ struct dispatch_to_cudf_column { reinterpret_cast(data_buffer->address()) + array.offset() * sizeof(T), sizeof(T) * num_rows, cudaMemcpyDefault, - stream)); + stream.value())); if (has_nulls) { - auto tmp_mask = get_mask_buffer(array, mr, stream); + auto tmp_mask = get_mask_buffer(array, stream, mr); // If array is sliced, we have to copy whole mask and then take copy. auto out_mask = (num_rows == static_cast(data_buffer->size() / sizeof(T))) @@ -138,7 +138,7 @@ struct dispatch_to_cudf_column { : cudf::detail::copy_bitmask(static_cast(tmp_mask->data()), array.offset(), array.offset() + num_rows, - rmm::cuda_stream_view{stream}, + stream, mr); col->set_null_mask(std::move(out_mask)); @@ -162,16 +162,16 @@ std::unique_ptr get_empty_type_column(size_type size) std::unique_ptr get_column(arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); template <> std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto data_buffer = array.data()->buffers[1]; auto data = rmm::device_buffer(data_buffer->size(), stream, mr); @@ -179,20 +179,20 @@ std::unique_ptr dispatch_to_cudf_column::operator()( reinterpret_cast(data_buffer->address()), data_buffer->size(), cudaMemcpyDefault, - stream)); + stream.value())); auto out_col = mask_to_bools(static_cast(data.data()), array.offset(), array.offset() + array.length(), - stream, + stream.value(), mr); auto const has_nulls = skip_mask ? 
false : array.null_bitmap_data() != nullptr; if (has_nulls) { auto out_mask = - detail::copy_bitmask(static_cast(get_mask_buffer(array, mr, stream)->data()), + detail::copy_bitmask(static_cast(get_mask_buffer(array, stream, mr)->data()), array.offset(), array.offset() + array.length(), - rmm::cuda_stream_view{stream}, + stream, mr); out_col->set_null_mask(std::move(out_mask)); @@ -206,10 +206,12 @@ std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - if (array.length() == 0) { return cudf::strings::detail::make_empty_strings_column(mr, stream); } + if (array.length() == 0) { + return cudf::strings::detail::make_empty_strings_column(mr, stream.value()); + } auto str_array = static_cast(&array); auto offset_array = std::make_unique( str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr); @@ -217,16 +219,16 @@ std::unique_ptr dispatch_to_cudf_column::operator()( str_array->value_data()->size(), str_array->value_data(), nullptr); auto offsets_column = dispatch_to_cudf_column{}.operator()( - *offset_array, data_type(type_id::INT32), true, mr, stream); + *offset_array, data_type(type_id::INT32), true, stream, mr); auto chars_column = dispatch_to_cudf_column{}.operator()( - *char_array, data_type(type_id::INT8), true, mr, stream); + *char_array, data_type(type_id::INT8), true, stream, mr); auto const num_rows = offsets_column->size() - 1; auto out_col = make_strings_column(num_rows, std::move(offsets_column), std::move(chars_column), UNKNOWN_NULL_COUNT, - std::move(*get_mask_buffer(array, mr, stream)), + std::move(*get_mask_buffer(array, stream, mr)), stream, mr); @@ -242,19 +244,20 @@ std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto dict_array = static_cast(&array); auto dict_type = arrow_to_cudf_type(*(dict_array->dictionary()->type())); - auto keys_column = get_column(*(dict_array->dictionary()), dict_type, true, mr, stream); + auto keys_column = get_column(*(dict_array->dictionary()), dict_type, true, stream, mr); auto ind_type = arrow_to_cudf_type(*(dict_array->indices()->type())); - auto indices_column = get_column(*(dict_array->indices()), ind_type, false, mr, stream); + auto indices_column = get_column(*(dict_array->indices()), ind_type, false, stream, mr); // If index type is not of type uint32_t, then cast it to uint32_t auto const dict_indices_type = data_type{type_id::UINT32}; if (indices_column->type().id() != dict_indices_type.id()) - indices_column = cudf::detail::cast(indices_column->view(), dict_indices_type, mr, stream); + indices_column = + cudf::detail::cast(indices_column->view(), dict_indices_type, mr, stream.value()); // Child columns shouldn't have masks and we need the mask in main column auto column_contents = indices_column->release(); @@ -272,8 +275,8 @@ std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto struct_array = static_cast(&array); std::vector> child_columns; @@ -284,10 +287,10 @@ std::unique_ptr dispatch_to_cudf_column::operator()( 
std::back_inserter(child_columns), [&mr, &stream](auto const& child_array) { auto type = arrow_to_cudf_type(*(child_array->type())); - return get_column(*child_array, type, false, mr, stream); + return get_column(*child_array, type, false, stream, mr); }); - auto out_mask = *(get_mask_buffer(array, mr, stream)); + auto out_mask = *(get_mask_buffer(array, stream, mr)); if (struct_array->null_bitmap_data() != nullptr) { out_mask = detail::copy_bitmask(static_cast(out_mask.data()), array.offset(), @@ -305,24 +308,24 @@ std::unique_ptr dispatch_to_cudf_column::operator()( arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto list_array = static_cast(&array); auto offset_array = std::make_unique( list_array->value_offsets()->size() / sizeof(int32_t), list_array->value_offsets(), nullptr); auto offsets_column = dispatch_to_cudf_column{}.operator()( - *offset_array, data_type(type_id::INT32), true, mr, stream); + *offset_array, data_type(type_id::INT32), true, stream, mr); auto child_type = arrow_to_cudf_type(*(list_array->values()->type())); - auto child_column = get_column(*(list_array->values()), child_type, false, mr, stream); + auto child_column = get_column(*(list_array->values()), child_type, false, stream, mr); auto const num_rows = offsets_column->size() - 1; auto out_col = make_lists_column(num_rows, std::move(offsets_column), std::move(child_column), UNKNOWN_NULL_COUNT, - std::move(*get_mask_buffer(array, mr, stream)), + std::move(*get_mask_buffer(array, stream, mr)), stream, mr); @@ -336,19 +339,19 @@ std::unique_ptr dispatch_to_cudf_column::operator()( std::unique_ptr get_column(arrow::Array const& array, data_type type, bool skip_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return type.id() != type_id::EMPTY - ? type_dispatcher(type, dispatch_to_cudf_column{}, array, type, skip_mask, mr, stream) + ? type_dispatcher(type, dispatch_to_cudf_column{}, array, type, skip_mask, stream, mr) : get_empty_type_column(array.length()); } } // namespace std::unique_ptr
from_arrow(arrow::Table const& input_table,
-                                  rmm::mr::device_memory_resource* mr,
-                                  cudaStream_t stream)
+                                  rmm::cuda_stream_view stream,
+                                  rmm::mr::device_memory_resource* mr)
 {
   if (input_table.num_columns() == 0) { return std::make_unique<table>
(); }

   std::vector<std::unique_ptr<column>> columns;
@@ -363,12 +366,12 @@ std::unique_ptr<table>
from_arrow(arrow::Table const& input_table, if (cudf_type.id() == type_id::EMPTY) { return get_empty_type_column(chunked_array->length()); } - transform(array_chunks.begin(), - array_chunks.end(), - std::back_inserter(concat_columns), - [&cudf_type, &mr, &stream](auto const& array_chunk) { - return get_column(*array_chunk, cudf_type, false, mr, stream); - }); + std::transform(array_chunks.begin(), + array_chunks.end(), + std::back_inserter(concat_columns), + [&cudf_type, &mr, &stream](auto const& array_chunk) { + return get_column(*array_chunk, cudf_type, false, stream, mr); + }); if (concat_columns.empty()) { return std::make_unique(cudf_type, 0, rmm::device_buffer(0)); } else if (concat_columns.size() == 1) { @@ -393,7 +396,7 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, { CUDF_FUNC_RANGE(); - return detail::from_arrow(input_table, mr); + return detail::from_arrow(input_table, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/interop/to_arrow.cpp b/cpp/src/interop/to_arrow.cpp index 13afde2ee4b..4f7a939b055 100644 --- a/cpp/src/interop/to_arrow.cpp +++ b/cpp/src/interop/to_arrow.cpp @@ -27,6 +27,7 @@ #include #include +#include #include namespace cudf { @@ -39,7 +40,7 @@ namespace { template std::shared_ptr fetch_data_buffer(column_view input_view, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const int64_t data_size_in_bytes = sizeof(T) * input_view.size(); @@ -52,7 +53,7 @@ std::shared_ptr fetch_data_buffer(column_view input_view, input_view.data(), data_size_in_bytes, cudaMemcpyDeviceToHost, - stream)); + stream.value())); return data_buffer; } @@ -62,7 +63,7 @@ std::shared_ptr fetch_data_buffer(column_view input_view, */ std::shared_ptr fetch_mask_buffer(column_view input_view, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { const int64_t mask_size_in_bytes = cudf::bitmask_allocation_size_bytes(input_view.size()); @@ -75,7 +76,7 @@ std::shared_ptr fetch_mask_buffer(column_view input_view, (input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(), mask_size_in_bytes, cudaMemcpyDeviceToHost, - stream)); + stream.value())); // Resets all padded bits to 0 mask_buffer->ZeroPadding(); @@ -97,7 +98,7 @@ struct dispatch_to_arrow { column_view input_view, std::vector const& metadata, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::vector> child_arrays; std::vector child_indices(input_view.num_children()); @@ -119,7 +120,7 @@ struct dispatch_to_arrow { cudf::type_id id, column_metadata const& metadata, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { return to_arrow_array(id, static_cast(input_view.size()), @@ -134,9 +135,9 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in cudf::type_id id, column_metadata const& metadata, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - auto bitmask = bools_to_mask(input, rmm::mr::get_current_device_resource(), stream); + auto bitmask = bools_to_mask(input, rmm::mr::get_current_device_resource(), stream.value()); auto result = arrow::AllocateBuffer(static_cast(bitmask.first->size()), ar_mr); CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); @@ -147,7 +148,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in bitmask.first->data(), bitmask.first->size(), cudaMemcpyDeviceToHost, - stream)); + stream.value())); return to_arrow_array(id, static_cast(input.size()), data_buffer, @@ -161,7 +162,7 @@ std::shared_ptr dispatch_to_arrow::operator()( cudf::type_id id, column_metadata const& metadata, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::unique_ptr tmp_column = ((input.offset() != 0) or @@ -201,7 +202,7 @@ std::shared_ptr dispatch_to_arrow::operator()( cudf::type_id id, column_metadata const& metadata, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_EXPECTS(metadata.children_meta.size() == input.num_children(), "Number of field names and number of children doesn't match\n"); @@ -237,7 +238,7 @@ std::shared_ptr dispatch_to_arrow::operator()( cudf::type_id id, column_metadata const& metadata, arrow::MemoryPool* ar_mr, - 
cudaStream_t stream) + rmm::cuda_stream_view stream) { std::unique_ptr tmp_column = nullptr; if ((input.offset() != 0) or @@ -269,14 +270,14 @@ std::shared_ptr dispatch_to_arrow::operator()( cudf::type_id id, column_metadata const& metadata, arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // Arrow dictionary requires indices to be signed integer std::unique_ptr dict_indices = cast(cudf::dictionary_column_view(input).get_indices_annotated(), cudf::data_type{type_id::INT32}, rmm::mr::get_current_device_resource(), - stream); + stream.value()); auto indices = dispatch_to_arrow{}.operator()( dict_indices->view(), dict_indices->type().id(), {}, ar_mr, stream); auto dict_keys = cudf::dictionary_column_view(input).keys(); @@ -296,8 +297,8 @@ std::shared_ptr dispatch_to_arrow::operator()( std::shared_ptr to_arrow(table_view input, std::vector const& metadata, - arrow::MemoryPool* ar_mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr) { CUDF_EXPECTS((metadata.size() == input.num_columns()), "columns' metadata should be equal to number of columns in table"); @@ -335,7 +336,7 @@ std::shared_ptr to_arrow(table_view input, { CUDF_FUNC_RANGE(); - return detail::to_arrow(input, metadata, ar_mr); + return detail::to_arrow(input, metadata, rmm::cuda_stream_default, ar_mr); } } // namespace cudf From a546bcc8c576c67e5caa39f73d0b44cfa1562981 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 11:55:36 +1100 Subject: [PATCH 27/51] Add missing dlpack and to_arrow synchronization. --- cpp/src/interop/dlpack.cpp | 6 ++++++ cpp/src/interop/to_arrow.cpp | 9 ++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index bb79a1d437e..1ae6119aefd 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -249,6 +249,12 @@ DLManagedTensor* to_dlpack(table_view const& input, // Defer ownership of managed tensor to caller managed_tensor->deleter = dltensor_context::deleter; managed_tensor->manager_ctx = context.release(); + + // synchronize the stream because after the return the data may be accessed from the host before + // the above `cudaMemcpyAsync` calls have completed their copies (especially if pinned host + // memory is used). + stream.synchronize(); + return managed_tensor.release(); } diff --git a/cpp/src/interop/to_arrow.cpp b/cpp/src/interop/to_arrow.cpp index 4f7a939b055..5f270597403 100644 --- a/cpp/src/interop/to_arrow.cpp +++ b/cpp/src/interop/to_arrow.cpp @@ -326,7 +326,14 @@ std::shared_ptr to_arrow(table_view input, std::back_inserter(fields), [](auto const& array, auto const& meta) { return arrow::field(meta.name, array->type()); }); - return arrow::Table::Make(arrow::schema(fields), arrays); + auto result = arrow::Table::Make(arrow::schema(fields), arrays); + + // synchronize the stream because after the return the data may be accessed from the host before + // the above `cudaMemcpyAsync` calls have completed their copies (especially if pinned host + // memory is used). 
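+  // As an illustration only (a hedged sketch, not code from this patch; `host_buf`, `d_ptr`
+  // and `nbytes` are hypothetical names), the hazard is a host read racing an asynchronous
+  // device-to-host copy:
+  //   cudaMemcpyAsync(host_buf, d_ptr, nbytes, cudaMemcpyDeviceToHost, stream.value());
+  //   stream.synchronize();         // without this, reading host_buf below is a data race
+  //   auto first_byte = host_buf[0];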
+ stream.synchronize(); + + return result; } } // namespace detail From 7d863dc439e145e9bc53797e6ee2ce00acb5e59b Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 12:44:41 +1100 Subject: [PATCH 28/51] Convert reductions, quantiles to cuda_stream_view --- cpp/include/cudf/detail/quantiles.hpp | 56 +++++++++++++++ cpp/include/cudf/detail/reduction.cuh | 33 ++++----- .../cudf/detail/reduction_functions.hpp | 51 +++++++------ cpp/src/quantiles/quantile.cu | 71 ++++++++++++++----- cpp/src/reductions/all.cu | 13 ++-- cpp/src/reductions/any.cu | 13 ++-- cpp/src/reductions/compound.cuh | 36 +++++----- cpp/src/reductions/max.cu | 13 ++-- cpp/src/reductions/mean.cu | 13 ++-- cpp/src/reductions/min.cu | 13 ++-- cpp/src/reductions/nth_element.cu | 13 ++-- cpp/src/reductions/product.cu | 13 ++-- cpp/src/reductions/reductions.cpp | 71 ++++++++++--------- cpp/src/reductions/simple.cuh | 33 ++++----- cpp/src/reductions/std.cu | 13 ++-- cpp/src/reductions/sum.cu | 13 ++-- cpp/src/reductions/sum_of_squares.cu | 11 +-- cpp/src/reductions/var.cu | 13 ++-- 18 files changed, 315 insertions(+), 177 deletions(-) create mode 100644 cpp/include/cudf/detail/quantiles.hpp diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp new file mode 100644 index 00000000000..e93886c4f11 --- /dev/null +++ b/cpp/include/cudf/detail/quantiles.hpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cudf { +namespace detail { + +/** @copydoc cudf::quantile(column_view const&, std::vector const&, interpolation, + column_view const&, bool, rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr quantile( + column_view const& input, + std::vector const& q, + interpolation interp = interpolation::LINEAR, + column_view const& ordered_indices = {}, + bool exact = true, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @copydoc cudf::quantiles(table_view const&, std::vector const&, interpolation, + cudf::sorted, std::vector const&, std::vector const&, + rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr
quantiles( + table_view const& input, + std::vector const& q, + interpolation interp = interpolation::NEAREST, + cudf::sorted is_input_sorted = sorted::NO, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/reduction.cuh b/cpp/include/cudf/detail/reduction.cuh index 84cde38fab8..063114adbc3 100644 --- a/cpp/include/cudf/detail/reduction.cuh +++ b/cpp/include/cudf/detail/reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include #include "reduction_operators.cuh" +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace reduction { @@ -49,8 +50,8 @@ template reduce(InputIterator d_in, cudf::size_type num_items, op::simple_op sop, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto binary_op = sop.get_binary_op(); OutputType identity = sop.template get_identity(); @@ -66,7 +67,7 @@ std::unique_ptr reduce(InputIterator d_in, num_items, binary_op, identity, - stream); + stream.value()); d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; // Run reduction @@ -77,7 +78,7 @@ std::unique_ptr reduce(InputIterator d_in, num_items, binary_op, identity, - stream); + stream.value()); // only for string_view, data is copied auto s = new cudf::scalar_type_t(std::move(dev_result), true, stream, mr); @@ -92,8 +93,8 @@ template reduce(InputIterator d_in, cudf::size_type num_items, op::simple_op sop, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto binary_op = sop.get_binary_op(); OutputType identity = sop.template get_identity(); @@ -109,7 +110,7 @@ std::unique_ptr reduce(InputIterator d_in, num_items, binary_op, identity, - stream); + stream.value()); d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; // Run reduction @@ -120,7 +121,7 @@ std::unique_ptr reduce(InputIterator d_in, num_items, binary_op, identity, - stream); + stream.value()); using ScalarType = cudf::scalar_type_t; auto s = new ScalarType(dev_result, true, stream, mr); // only for string_view, data is copied @@ -135,8 +136,8 @@ template reduce(InputIterator d_in, cudf::size_type num_items, op::simple_op sop, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("dictionary type not supported"); } @@ -169,8 +170,8 @@ std::unique_ptr reduce(InputIterator d_in, op::compound_op cop, cudf::size_type valid_count, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto binary_op = cop.get_binary_op(); IntermediateType identity = cop.template get_identity(); @@ -186,7 +187,7 @@ std::unique_ptr reduce(InputIterator d_in, num_items, binary_op, identity, - stream); + stream.value()); d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; // Run reduction @@ -197,12 +198,12 @@ std::unique_ptr reduce(InputIterator d_in, num_items, binary_op, identity, - 
stream); + stream.value()); // compute the result value from intermediate value in device using ScalarType = cudf::scalar_type_t; auto result = new ScalarType(OutputType{0}, true, stream, mr); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), intermediate_result.data(), 1, [dres = result->data(), cop, valid_count, ddof] __device__(auto i) { diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/detail/reduction_functions.hpp index 99c7a679600..01df55dea05 100644 --- a/cpp/include/cudf/detail/reduction_functions.hpp +++ b/cpp/include/cudf/detail/reduction_functions.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace reduction { /** @@ -38,8 +40,9 @@ namespace reduction { std::unique_ptr sum( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes minimum of elements in input column * @@ -56,8 +59,9 @@ std::unique_ptr sum( std::unique_ptr min( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes maximum of elements in input column * @@ -74,8 +78,9 @@ std::unique_ptr min( std::unique_ptr max( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes any of elements in input column is true when typecasted to bool * @@ -93,8 +98,9 @@ std::unique_ptr max( std::unique_ptr any( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes all of elements in input column is true when typecasted to bool * @@ -112,8 +118,9 @@ std::unique_ptr any( std::unique_ptr all( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Computes product of elements in input column * @@ -131,8 +138,8 @@ std::unique_ptr all( std::unique_ptr product( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Computes sum of squares of elements in input column @@ -151,8 +158,8 @@ std::unique_ptr product( std::unique_ptr sum_of_squares( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Computes mean of elements in input column @@ -171,8 +178,8 @@ std::unique_ptr sum_of_squares( std::unique_ptr mean( column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Computes variance of elements in input column @@ -192,8 +199,8 @@ std::unique_ptr variance( column_view const& col, data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Computes standard deviation of elements in input column @@ -213,8 +220,8 @@ std::unique_ptr standard_deviation( column_view const& col, data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns nth element in input column @@ -244,8 +251,8 @@ std::unique_ptr nth_element( column_view const& col, size_type n, null_policy null_handling, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace reduction } // namespace cudf diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 31205f292c0..d4241157817 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -39,8 +39,8 @@ struct quantile_functor { std::vector const& q; interpolation interp; bool retain_types; - rmm::mr::device_memory_resource* mr; rmm::cuda_stream_view stream; + rmm::mr::device_memory_resource* mr; template std::enable_if_t::value, std::unique_ptr> operator()( @@ -55,9 +55,8 @@ struct quantile_functor { { using Result = std::conditional_t; - auto type = data_type{type_to_id()}; - auto output = - make_fixed_width_column(type, q.size(), mask_state::UNALLOCATED, stream.value(), mr); + auto type = data_type{type_to_id()}; + auto output = make_fixed_width_column(type, q.size(), mask_state::UNALLOCATED, stream, mr); if (output->size() == 0) { return output; } @@ -112,33 +111,42 @@ std::unique_ptr quantile(column_view const& input, std::vector const& q, interpolation interp, bool retain_types, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto functor = quantile_functor{ - ordered_indices, size, q, interp, retain_types, mr, stream}; + ordered_indices, size, q, interp, retain_types, stream, mr}; return type_dispatcher(input.type(), functor, input); } -} // namespace detail - std::unique_ptr quantile(column_view const& input, std::vector const& q, interpolation interp, column_view const& ordered_indices, bool exact, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* 
mr) { - CUDF_FUNC_RANGE(); - if (ordered_indices.is_empty()) { if (exact) { - return detail::quantile( - input, thrust::make_counting_iterator(0), input.size(), q, interp, exact, mr, 0); + return detail::quantile(input, + thrust::make_counting_iterator(0), + input.size(), + q, + interp, + exact, + stream, + mr); } else { - return detail::quantile( - input, thrust::make_counting_iterator(0), input.size(), q, interp, exact, mr, 0); + return detail::quantile(input, + thrust::make_counting_iterator(0), + input.size(), + q, + interp, + exact, + stream, + mr); } } else { @@ -146,13 +154,38 @@ std::unique_ptr quantile(column_view const& input, "`ordered_indicies` type must be `INT32`."); if (exact) { - return detail::quantile( - input, ordered_indices.data(), ordered_indices.size(), q, interp, exact, mr, 0); + return detail::quantile(input, + ordered_indices.data(), + ordered_indices.size(), + q, + interp, + exact, + stream, + mr); } else { - return detail::quantile( - input, ordered_indices.data(), ordered_indices.size(), q, interp, exact, mr, 0); + return detail::quantile(input, + ordered_indices.data(), + ordered_indices.size(), + q, + interp, + exact, + stream, + mr); } } } +} // namespace detail + +std::unique_ptr quantile(column_view const& input, + std::vector const& q, + interpolation interp, + column_view const& ordered_indices, + bool exact, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::quantile(input, q, interp, ordered_indices, exact, rmm::cuda_stream_default, mr); +} + } // namespace cudf diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu index 80a0b25176e..496ea822e92 100644 --- a/cpp/src/reductions/all.cu +++ b/cpp/src/reductions/all.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,15 +15,18 @@ */ // The translation unit for reduction `max` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::all(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8), "all() operation can be applied with output type `bool8` only"); - return cudf::reduction::min(col, cudf::data_type(cudf::type_id::BOOL8), mr, stream); + return cudf::reduction::min(col, cudf::data_type(cudf::type_id::BOOL8), stream, mr); } diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu index ff04714190b..91d2c2f767a 100644 --- a/cpp/src/reductions/any.cu +++ b/cpp/src/reductions/any.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,15 +15,18 @@ */ // The translation unit for reduction `max` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::any(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8), "any() operation can be applied with output type `bool8` only"); - return cudf::reduction::max(col, cudf::data_type(cudf::type_id::BOOL8), mr, stream); + return cudf::reduction::max(col, cudf::data_type(cudf::type_id::BOOL8), stream, mr); } diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh index 4bda26409fd..18baa37bd21 100644 --- a/cpp/src/reductions/compound.cuh +++ b/cpp/src/reductions/compound.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,8 @@ #include #include + +#include #include namespace cudf { @@ -33,9 +35,9 @@ namespace compound { * @param[in] ddof `Delta Degrees of Freedom` used for `std`, `var`. * The divisor used in calculations is N - ddof, where N * represents the number of elements. - * @param[in] mr Device memory resource used to allocate the returned scalar's device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @returns Output scalar in device memory + * @param[in] mr Device memory resource used to allocate the returned scalar's device memory + * @return Output scalar in device memory * * @tparam ElementType the input column cudf dtype * @tparam ResultType the output cudf dtype @@ -46,8 +48,8 @@ template std::unique_ptr compound_reduction(column_view const& col, data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { cudf::size_type valid_count = col.size() - col.null_count(); @@ -61,12 +63,12 @@ std::unique_ptr compound_reduction(column_view const& col, dcol->pair_begin(), compound_op.template get_null_replacing_element_transformer()); result = detail::reduce( - it, col.size(), compound_op, valid_count, ddof, mr, stream); + it, col.size(), compound_op, valid_count, ddof, stream, mr); } else { auto it = thrust::make_transform_iterator( dcol->begin(), compound_op.template get_element_transformer()); result = detail::reduce( - it, col.size(), compound_op, valid_count, ddof, mr, stream); + it, col.size(), compound_op, valid_count, ddof, stream, mr); } // set scalar is valid if (col.null_count() < col.size()) @@ -93,18 +95,18 @@ struct result_type_dispatcher { std::unique_ptr operator()(column_view const& col, cudf::data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return compound_reduction(col, output_dtype, ddof, mr, stream); + return compound_reduction(col, output_dtype, ddof, stream, mr); } template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, cudf::data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unsupported output data type"); } 
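For reference, a minimal sketch (not part of this patch) of a caller driving a compound reduction through the new argument order, where the stream precedes the memory resource and `mr` keeps its default; `sample_stddev` and `input` are hypothetical names, and the call matches the `cudf::reduction::standard_deviation` declaration in reduction_functions.hpp above:

  #include <cudf/column/column_view.hpp>
  #include <cudf/detail/reduction_functions.hpp>
  #include <rmm/cuda_stream_view.hpp>
  #include <memory>

  std::unique_ptr<cudf::scalar> sample_stddev(cudf::column_view const& input,
                                              rmm::cuda_stream_view stream)
  {
    // ddof = 1 selects the sample statistic (divisor N - 1); the memory resource argument
    // is omitted so it keeps its default of rmm::mr::get_current_device_resource().
    return cudf::reduction::standard_deviation(
      input, cudf::data_type{cudf::type_id::FLOAT64}, 1, stream);
  }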
@@ -126,19 +128,19 @@ struct element_type_dispatcher { std::unique_ptr operator()(column_view const& col, cudf::data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return cudf::type_dispatcher( - output_dtype, result_type_dispatcher(), col, output_dtype, ddof, mr, stream); + output_dtype, result_type_dispatcher(), col, output_dtype, ddof, stream, mr); } template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, cudf::data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL( "Reduction operators other than `min` and `max`" diff --git a/cpp/src/reductions/max.cu b/cpp/src/reductions/max.cu index 74084091d5b..88819783af7 100644 --- a/cpp/src/reductions/max.cu +++ b/cpp/src/reductions/max.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,14 +15,17 @@ */ // The translation unit for reduction `max` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::max(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using reducer = cudf::reduction::simple::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, stream, mr); } diff --git a/cpp/src/reductions/mean.cu b/cpp/src/reductions/mean.cu index 5d2d3b17b2c..b05016ed257 100644 --- a/cpp/src/reductions/mean.cu +++ b/cpp/src/reductions/mean.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,15 +15,18 @@ */ // The translation unit for reduction `mean` -#include #include "compound.cuh" +#include + +#include + std::unique_ptr cudf::reduction::mean(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using reducer = cudf::reduction::compound::element_type_dispatcher; return cudf::type_dispatcher( - col.type(), reducer(), col, output_dtype, /* ddof is not used for mean*/ 1, mr, stream); + col.type(), reducer(), col, output_dtype, /* ddof is not used for mean*/ 1, stream, mr); } diff --git a/cpp/src/reductions/min.cu b/cpp/src/reductions/min.cu index 67c2e714a52..fcbdf456de2 100644 --- a/cpp/src/reductions/min.cu +++ b/cpp/src/reductions/min.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,14 +15,17 @@ */ // The translation unit for reduction `min` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::min(column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using reducer = cudf::reduction::simple::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, stream, mr); } diff --git a/cpp/src/reductions/nth_element.cu b/cpp/src/reductions/nth_element.cu index f68270cf8b3..85e0b8afde9 100644 --- a/cpp/src/reductions/nth_element.cu +++ b/cpp/src/reductions/nth_element.cu @@ -20,14 +20,16 @@ #include #include -#include +#include #include +#include + std::unique_ptr cudf::reduction::nth_element(column_view const& col, size_type n, null_policy null_handling, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(n >= -col.size() and n < col.size(), "Index out of bounds"); auto wrap_n = [n](size_type size) { return (n < 0 ? size + n : n); }; @@ -41,11 +43,12 @@ std::unique_ptr cudf::reduction::nth_element(column_view const& co [] __device__(auto b) { return static_cast(b); }); rmm::device_uvector null_skipped_index(col.size(), stream); // null skipped index for valids only. - thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), bitmask_iterator, bitmask_iterator + col.size(), null_skipped_index.begin()); - auto n_pos = thrust::upper_bound(rmm::exec_policy(stream)->on(stream), + + auto n_pos = thrust::upper_bound(rmm::exec_policy(stream)->on(stream.value()), null_skipped_index.begin(), null_skipped_index.end(), n); diff --git a/cpp/src/reductions/product.cu b/cpp/src/reductions/product.cu index 5b9b78ec2ce..8f23bbb88cc 100644 --- a/cpp/src/reductions/product.cu +++ b/cpp/src/reductions/product.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,14 +15,17 @@ */ // The translation unit for reduction `product` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::product(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using reducer = cudf::reduction::simple::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, stream, mr); } diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index de4608ed391..7afebaab154 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -17,27 +17,28 @@ #include #include #include -#include -#include -#include - #include +#include #include +#include +#include #include #include +#include + namespace cudf { namespace detail { struct reduce_dispatch_functor { column_view const col; data_type output_dtype; rmm::mr::device_memory_resource *mr; - cudaStream_t stream; + rmm::cuda_stream_view stream; reduce_dispatch_functor(column_view const &col, data_type output_dtype, - rmm::mr::device_memory_resource *mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) : col(col), output_dtype(output_dtype), mr(mr), stream(stream) { } @@ -46,55 +47,59 @@ struct reduce_dispatch_functor { std::unique_ptr operator()(std::unique_ptr const &agg) { switch (k) { - case aggregation::SUM: return reduction::sum(col, output_dtype, mr, stream); break; - case aggregation::PRODUCT: return reduction::product(col, output_dtype, mr, stream); break; - case aggregation::MIN: return reduction::min(col, output_dtype, mr, stream); break; - case aggregation::MAX: return reduction::max(col, output_dtype, mr, stream); break; - case aggregation::ANY: return reduction::any(col, output_dtype, mr, stream); break; - case aggregation::ALL: return reduction::all(col, output_dtype, mr, stream); break; + case aggregation::SUM: return reduction::sum(col, output_dtype, stream, mr); break; + case aggregation::PRODUCT: return reduction::product(col, output_dtype, stream, mr); break; + case aggregation::MIN: return reduction::min(col, output_dtype, stream, mr); break; + case aggregation::MAX: return reduction::max(col, output_dtype, stream, mr); break; + case aggregation::ANY: return reduction::any(col, output_dtype, stream, mr); break; + case aggregation::ALL: return reduction::all(col, output_dtype, stream, mr); break; case aggregation::SUM_OF_SQUARES: - return reduction::sum_of_squares(col, output_dtype, mr, stream); + return reduction::sum_of_squares(col, output_dtype, stream, mr); break; - case aggregation::MEAN: return reduction::mean(col, output_dtype, mr, stream); break; + case aggregation::MEAN: return reduction::mean(col, output_dtype, stream, mr); break; case aggregation::VARIANCE: { auto var_agg = static_cast(agg.get()); - return reduction::variance(col, output_dtype, var_agg->_ddof, mr, stream); + return reduction::variance(col, output_dtype, var_agg->_ddof, stream, mr); } break; case aggregation::STD: { auto var_agg = static_cast(agg.get()); - return reduction::standard_deviation(col, output_dtype, var_agg->_ddof, mr, stream); + return reduction::standard_deviation(col, output_dtype, var_agg->_ddof, stream, mr); } break; case aggregation::MEDIAN: { - auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, 
mr); + auto sorted_indices = + detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, mr, stream.value()); auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; - auto col_ptr = quantile(col, {0.5}, interpolation::LINEAR, valid_sorted_indices, true, mr); + auto col_ptr = detail::quantile( + col, {0.5}, interpolation::LINEAR, valid_sorted_indices, true, stream, mr); return get_element(*col_ptr, 0, mr); } break; case aggregation::QUANTILE: { auto quantile_agg = static_cast(agg.get()); CUDF_EXPECTS(quantile_agg->_quantiles.size() == 1, "Reduction quantile accepts only one quantile value"); - auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, mr); + auto sorted_indices = + detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, mr, stream.value()); auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; - auto col_ptr = quantile(col, - quantile_agg->_quantiles, - quantile_agg->_interpolation, - valid_sorted_indices, - true, - mr); + auto col_ptr = detail::quantile(col, + quantile_agg->_quantiles, + quantile_agg->_interpolation, + valid_sorted_indices, + true, + stream, + mr); return get_element(*col_ptr, 0, mr); } break; case aggregation::NUNIQUE: { auto nunique_agg = static_cast(agg.get()); return make_fixed_width_scalar( detail::distinct_count( - col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream), - stream, + col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream.value()), + stream.value(), mr); } break; case aggregation::NTH_ELEMENT: { auto nth_agg = static_cast(agg.get()); - return reduction::nth_element(col, nth_agg->_n, nth_agg->_null_handling, mr, stream); + return reduction::nth_element(col, nth_agg->_n, nth_agg->_null_handling, stream, mr); } break; default: CUDF_FAIL("Unsupported reduction operator"); } @@ -105,8 +110,8 @@ std::unique_ptr reduce( column_view const &col, std::unique_ptr const &agg, data_type output_dtype, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { std::unique_ptr result = make_default_constructed_scalar(output_dtype); result->set_valid(false, stream); @@ -115,7 +120,7 @@ std::unique_ptr reduce( if (col.size() <= col.null_count()) return result; result = - aggregation_dispatcher(agg->kind, reduce_dispatch_functor{col, output_dtype, mr, stream}, agg); + aggregation_dispatcher(agg->kind, reduce_dispatch_functor{col, output_dtype, stream, mr}, agg); return result; } } // namespace detail @@ -126,7 +131,7 @@ std::unique_ptr reduce(column_view const &col, rmm::mr::device_memory_resource *mr) { CUDF_FUNC_RANGE(); - return detail::reduce(col, agg, output_dtype, mr); + return detail::reduce(col, agg, output_dtype, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh index 980b709e241..c10c163d0c4 100644 --- a/cpp/src/reductions/simple.cuh +++ b/cpp/src/reductions/simple.cuh @@ -17,11 +17,12 @@ #pragma once #include - #include +#include #include #include -#include "cudf/structs/struct_view.hpp" + +#include namespace cudf { namespace reduction { @@ -42,8 +43,8 @@ namespace simple { template std::unique_ptr simple_reduction(column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view 
stream, + rmm::mr::device_memory_resource* mr) { // reduction by iterator auto dcol = cudf::column_device_view::create(col, stream); @@ -54,11 +55,11 @@ std::unique_ptr simple_reduction(column_view const& col, auto it = thrust::make_transform_iterator( dcol->pair_begin(), simple_op.template get_null_replacing_element_transformer()); - result = detail::reduce(it, col.size(), Op{}, mr, stream); + result = detail::reduce(it, col.size(), Op{}, stream, mr); } else { auto it = thrust::make_transform_iterator( dcol->begin(), simple_op.template get_element_transformer()); - result = detail::reduce(it, col.size(), Op{}, mr, stream); + result = detail::reduce(it, col.size(), Op{}, stream, mr); } // set scalar is valid result->set_valid((col.null_count() < col.size()), stream); @@ -91,17 +92,17 @@ struct result_type_dispatcher { template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return simple_reduction(col, output_dtype, mr, stream); + return simple_reduction(col, output_dtype, stream, mr); } template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("input data type is not convertible to output data type"); } @@ -129,18 +130,18 @@ struct element_type_dispatcher { template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return cudf::type_dispatcher( - output_dtype, result_type_dispatcher(), col, output_dtype, mr, stream); + output_dtype, result_type_dispatcher(), col, output_dtype, stream, mr); } template ()>* = nullptr> std::unique_ptr operator()(column_view const& col, data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL( "Reduction operators other than `min` and `max`" diff --git a/cpp/src/reductions/std.cu b/cpp/src/reductions/std.cu index 39ba7e8292c..a3f410f1407 100644 --- a/cpp/src/reductions/std.cu +++ b/cpp/src/reductions/std.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,12 @@ */ // The translation unit for reduction `standard deviation` -#include #include "compound.cuh" +#include + +#include + // @param[in] ddof Delta Degrees of Freedom used for `std`, `var`. // The divisor used in calculations is N - ddof, where N // represents the number of elements. 
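// Illustrative arithmetic (not part of this patch): with N = 10 input rows and ddof = 1,
// the divisor is 10 - 1 = 9, so var = sum((x - mean)^2) / 9 and std = sqrt(var);
// ddof = 0 would give the population statistics with divisor N = 10.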
@@ -26,14 +29,14 @@ std::unique_ptr cudf::reduction::standard_deviation( column_view const& col, cudf::data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // TODO: add cuda version check when the fix is available #if !defined(__CUDACC_DEBUG__) using reducer = cudf::reduction::compound::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, ddof, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, ddof, stream, mr); #else // workaround for bug 200529165 which causes compilation error only at device // debug build the bug will be fixed at cuda 10.2 diff --git a/cpp/src/reductions/sum.cu b/cpp/src/reductions/sum.cu index f75002e1eba..d295dfe3706 100644 --- a/cpp/src/reductions/sum.cu +++ b/cpp/src/reductions/sum.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,14 +15,17 @@ */ // The translation unit for reduction `sum` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::sum(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using reducer = cudf::reduction::simple::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, stream, mr); } diff --git a/cpp/src/reductions/sum_of_squares.cu b/cpp/src/reductions/sum_of_squares.cu index a989eb7ad48..ca898bf9bce 100644 --- a/cpp/src/reductions/sum_of_squares.cu +++ b/cpp/src/reductions/sum_of_squares.cu @@ -15,15 +15,18 @@ */ // The translation unit for reduction `sum of squares` -#include #include "simple.cuh" +#include + +#include + std::unique_ptr cudf::reduction::sum_of_squares(column_view const& col, cudf::data_type const output_dtype, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using reducer = cudf::reduction::simple::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, stream, mr); } diff --git a/cpp/src/reductions/var.cu b/cpp/src/reductions/var.cu index 4d180c118c3..eab57344cc6 100644 --- a/cpp/src/reductions/var.cu +++ b/cpp/src/reductions/var.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,12 @@ // The translation unit for reduction `variance` -#include #include "compound.cuh" +#include + +#include + // @param[in] ddof Delta Degrees of Freedom used for `std`, `var`. // The divisor used in calculations is N - ddof, where N // represents the number of elements. 
@@ -26,13 +29,13 @@ std::unique_ptr cudf::reduction::variance(column_view const& col, cudf::data_type const output_dtype, cudf::size_type ddof, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // TODO: add cuda version check when the fix is available #if !defined(__CUDACC_DEBUG__) using reducer = cudf::reduction::compound::element_type_dispatcher; - return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, ddof, mr, stream); + return cudf::type_dispatcher(col.type(), reducer(), col, output_dtype, ddof, stream, mr); #else // workaround for bug 200529165 which causes compilation error only at device // debug build the bug will be fixed at cuda 10.2 From 84a200ea44274a1467313dc5c835df9ee19950f2 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 12:51:19 +1100 Subject: [PATCH 29/51] Convert repeat to cuda_stream_view --- cpp/include/cudf/detail/repeat.hpp | 12 +++++----- cpp/src/filling/repeat.cu | 36 ++++++++++++++++-------------- cpp/src/join/cross_join.cu | 2 +- 3 files changed, 27 insertions(+), 23 deletions(-) diff --git a/cpp/include/cudf/detail/repeat.hpp b/cpp/include/cudf/detail/repeat.hpp index afd6c0b5d5a..1c358b3da71 100644 --- a/cpp/include/cudf/detail/repeat.hpp +++ b/cpp/include/cudf/detail/repeat.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include +#include + #include namespace cudf { @@ -33,8 +35,8 @@ std::unique_ptr
repeat( table_view const& input_table, column_view const& count, bool check_count, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::repeat(table_view const&, size_type, @@ -45,8 +47,8 @@ std::unique_ptr
repeat( std::unique_ptr
repeat( table_view const& input_table, size_type count, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 96e2e15f262..224f6dfe3a0 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -29,14 +28,16 @@ #include #include +#include +#include +#include + #include #include #include #include #include -#include - #include #include @@ -73,7 +74,7 @@ struct compute_offsets { template std::enable_if_t::value, rmm::device_vector> operator()( - bool check_count, cudaStream_t stream = 0) + bool check_count, rmm::cuda_stream_view stream) { // static_cast is necessary due to bool if (check_count && static_cast(std::numeric_limits::max()) > @@ -83,14 +84,15 @@ struct compute_offsets { "count should not have values larger than size_type's limit."); } rmm::device_vector offsets(p_column->size()); - thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream), + thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream.value()), p_column->begin(), p_column->end(), offsets.begin()); if (check_count == true) { - CUDF_EXPECTS(thrust::is_sorted( - rmm::exec_policy(stream)->on(stream), offsets.begin(), offsets.end()) == true, - "count has negative values or the resulting table has more \ + CUDF_EXPECTS( + thrust::is_sorted( + rmm::exec_policy(stream)->on(stream.value()), offsets.begin(), offsets.end()) == true, + "count has negative values or the resulting table has more \ rows than size_type's limit."); } @@ -99,7 +101,7 @@ struct compute_offsets { template std::enable_if_t::value, rmm::device_vector> operator()( - bool check_count, cudaStream_t stream) + bool check_count, rmm::cuda_stream_view stream) { CUDF_FAIL("count value should be a integral type."); } @@ -112,8 +114,8 @@ namespace detail { std::unique_ptr
repeat(table_view const& input_table, column_view const& count, bool check_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input_table.num_rows() == count.size(), "in and count must have equal size"); CUDF_EXPECTS(count.has_nulls() == false, "count cannot contain nulls"); @@ -124,7 +126,7 @@ std::unique_ptr
repeat(table_view const& input_table, size_type output_size{offsets.back()}; rmm::device_vector indices(output_size); - thrust::upper_bound(rmm::exec_policy(stream)->on(stream), + thrust::upper_bound(rmm::exec_policy(stream)->on(stream.value()), offsets.begin(), offsets.end(), thrust::make_counting_iterator(0), @@ -136,8 +138,8 @@ std::unique_ptr
repeat(table_view const& input_table, std::unique_ptr
repeat(table_view const& input_table, size_type count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(count >= 0, "count value should be non-negative"); CUDF_EXPECTS( @@ -162,7 +164,7 @@ std::unique_ptr
repeat(table_view const& input_table, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat(input_table, count, check_count, mr, 0); + return detail::repeat(input_table, count, check_count, rmm::cuda_stream_default, mr); } std::unique_ptr
repeat(table_view const& input_table, @@ -170,7 +172,7 @@ std::unique_ptr
repeat(table_view const& input_table, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat(input_table, count, mr, 0); + return detail::repeat(input_table, count, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu index 27aa7672825..5a2dc32e27a 100644 --- a/cpp/src/join/cross_join.cu +++ b/cpp/src/join/cross_join.cu @@ -54,7 +54,7 @@ std::unique_ptr cross_join( } // Repeat left table - auto left_repeated = detail::repeat(left, right.num_rows(), mr, stream); + auto left_repeated = detail::repeat(left, right.num_rows(), stream, mr); // Tile right table auto right_tiled = detail::tile(right, left.num_rows(), stream, mr); From 876d9efe999da86a7c2be7680a901d42b4c7a494 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 14:20:15 +1100 Subject: [PATCH 30/51] Add quantiles.hpp to meta.yaml --- conda/recipes/libcudf/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 6cb9ce2adff..b017940eee7 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -75,6 +75,7 @@ test: - test -f $PREFIX/include/cudf/detail/replace.hpp - test -f $PREFIX/include/cudf/detail/reshape.hpp - test -f $PREFIX/include/cudf/detail/round.hpp + - test -f $PREFIX/include/cudf/detail/quantiles.hpp - test -f $PREFIX/include/cudf/detail/scatter.hpp - test -f $PREFIX/include/cudf/detail/search.hpp - test -f $PREFIX/include/cudf/detail/sequence.hpp From 6a7d15ca241d766f012c9c5a1e8b1edaacf90d6c Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 14:20:56 +1100 Subject: [PATCH 31/51] Convert replace to cuda_stream_view --- cpp/include/cudf/detail/replace.hpp | 25 +++--- .../cudf/dictionary/detail/replace.hpp | 14 ++-- cpp/include/cudf/strings/detail/replace.hpp | 18 +++-- cpp/src/dictionary/replace.cu | 37 +++++---- cpp/src/io/json/reader_impl.cu | 2 +- cpp/src/replace/nans.cu | 33 ++++---- cpp/src/replace/nulls.cu | 76 ++++++++++--------- cpp/src/replace/replace.cu | 59 +++++++------- cpp/src/strings/replace/replace.cu | 66 ++++++++-------- 9 files changed, 172 insertions(+), 158 deletions(-) diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp index 989ea3f7e0f..d872f3edbcd 100644 --- a/cpp/include/cudf/detail/replace.hpp +++ b/cpp/include/cudf/detail/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * Copyright (c) 2018-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,13 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #pragma once #include -#include -// Forward declaration +#include + +#include namespace cudf { namespace detail { @@ -32,8 +32,8 @@ namespace detail { std::unique_ptr replace_nulls( column_view const& input, cudf::column_view const& replacement, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::replace_nulls(column_view const&, scalar const&, @@ -44,8 +44,8 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, scalar const& replacement, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::replace_nans(column_view const&, column_view const&, @@ -56,7 +56,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nans( column_view const& input, column_view const& replacement, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -68,7 +68,7 @@ std::unique_ptr replace_nans( std::unique_ptr replace_nans( column_view const& input, scalar const& replacement, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -80,7 +80,8 @@ std::unique_ptr find_and_replace_all( column_view const& input_col, column_view const& values_to_replace, column_view const& replacement_values, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/dictionary/detail/replace.hpp b/cpp/include/cudf/dictionary/detail/replace.hpp index 040f71a5751..7166633c378 100644 --- a/cpp/include/cudf/dictionary/detail/replace.hpp +++ b/cpp/include/cudf/dictionary/detail/replace.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -32,15 +34,15 @@ namespace detail { * * @param input Column with nulls to replace. * @param replacement Column with values to use for replacing. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column with null rows replaced. */ std::unique_ptr replace_nulls( dictionary_column_view const& input, dictionary_column_view const& replacement, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create a new dictionary column by replacing nulls with a @@ -50,15 +52,15 @@ std::unique_ptr replace_nulls( * * @param input Column with nulls to replace. * @param replacement Value to use for replacing. 
- * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column with null rows replaced. */ std::unique_ptr replace_nulls( dictionary_column_view const& input, scalar const& replacement, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp index 3a665492102..64e626794e7 100644 --- a/cpp/include/cudf/strings/detail/replace.hpp +++ b/cpp/include/cudf/strings/detail/replace.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -34,8 +36,8 @@ std::unique_ptr replace( string_scalar const& target, string_scalar const& repl, int32_t maxrepl = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::strings::replace_slice(strings_column_view const&, string_scalar const&, @@ -48,8 +50,8 @@ std::unique_ptr replace_slice( string_scalar const& repl = string_scalar(""), size_type start = 0, size_type stop = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::strings::replace(strings_column_view const&, strings_column_view const&, @@ -61,8 +63,8 @@ std::unique_ptr replace( strings_column_view const& strings, strings_column_view const& targets, strings_column_view const& repls, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::strings::replace(strings_column_view const&, string_scalar const&, @@ -73,8 +75,8 @@ std::unique_ptr replace( std::unique_ptr replace_nulls( strings_column_view const& strings, string_scalar const& repl = string_scalar(""), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace strings diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 918063ac508..097490c4ff3 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -23,6 +23,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" #include #include @@ -104,8 +105,8 @@ auto make_scalar_iterator(scalar const& input) template std::unique_ptr replace_indices(column_view const& input, ReplacementIter replacement_iter, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const input_view = 
column_device_view::create(input, stream); auto const d_input = *input_view; @@ -129,8 +130,8 @@ std::unique_ptr replace_indices(column_view const& input, */ std::unique_ptr replace_nulls(dictionary_column_view const& input, dictionary_column_view const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) { return cudf::empty_like(input.parent()); } if (!input.has_nulls()) { return std::make_unique(input.parent()); } @@ -138,7 +139,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, CUDF_EXPECTS(replacement.size() == input.size(), "column sizes must match"); // first combine the keys so both input dictionaries have the same set - auto matched = match_dictionaries({input, replacement}, mr, stream); + auto matched = match_dictionaries({input, replacement}, mr, stream.value()); // now build the new indices by doing replace-null using the updated input indices auto const input_indices = @@ -146,13 +147,15 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, auto const repl_indices = dictionary_column_view(matched.back()->view()).get_indices_annotated(); auto new_indices = repl_indices.has_nulls() - ? replace_indices(input_indices, make_nullable_index_iterator(repl_indices), mr, stream) + ? replace_indices(input_indices, make_nullable_index_iterator(repl_indices), stream, mr) : replace_indices( - input_indices, make_nullable_index_iterator(repl_indices), mr, stream); + input_indices, make_nullable_index_iterator(repl_indices), stream, mr); // auto keys_column = ; - return make_dictionary_column( - std::move(matched.front()->release().children.back()), std::move(new_indices), mr, stream); + return make_dictionary_column(std::move(matched.front()->release().children.back()), + std::move(new_indices), + mr, + stream.value()); } /** @@ -161,8 +164,8 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, */ std::unique_ptr replace_nulls(dictionary_column_view const& input, scalar const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) { return cudf::empty_like(input.parent()); } if (!input.has_nulls() || !replacement.is_valid()) { @@ -173,18 +176,20 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, // first add the replacment to the keys so only the indices need to be processed auto const default_mr = rmm::mr::get_current_device_resource(); auto input_matched = dictionary::detail::add_keys( - input, make_column_from_scalar(replacement, 1, stream, default_mr)->view(), mr, stream); + input, make_column_from_scalar(replacement, 1, stream, default_mr)->view(), mr, stream.value()); auto const input_view = dictionary_column_view(input_matched->view()); - auto const scalar_index = get_index(input_view, replacement, default_mr, stream); + auto const scalar_index = get_index(input_view, replacement, default_mr, stream.value()); // now build the new indices by doing replace-null on the updated indices auto const input_indices = input_view.get_indices_annotated(); auto new_indices = - replace_indices(input_indices, make_scalar_iterator(*scalar_index), mr, stream); + replace_indices(input_indices, make_scalar_iterator(*scalar_index), stream, mr); new_indices->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); - return make_dictionary_column( - std::move(input_matched->release().children.back()), 
std::move(new_indices), mr, stream); + return make_dictionary_column(std::move(input_matched->release().children.back()), + std::move(new_indices), + mr, + stream.value()); } } // namespace detail diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index ae0cb40e522..3246f7e9ed0 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -609,7 +609,7 @@ table_with_metadata reader::impl::convert_data_to_table(cudaStream_t stream) if (out_column->type().id() == type_id::STRING) { // Need to remove escape character in case of '\"' and '\\' out_columns.emplace_back(cudf::strings::detail::replace( - out_column->view(), target->view(), repl->view(), mr_, stream)); + out_column->view(), target->view(), repl->view(), stream, mr_)); } else { out_columns.emplace_back(std::move(out_column)); } diff --git a/cpp/src/replace/nans.cu b/cpp/src/replace/nans.cu index 6232da34f06..d26be0ad47f 100644 --- a/cpp/src/replace/nans.cu +++ b/cpp/src/replace/nans.cu @@ -25,6 +25,8 @@ #include #include +#include + #include namespace cudf { @@ -37,8 +39,8 @@ struct replace_nans_functor { column_view const& input, Replacement const& replacement, bool replacement_nullable, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type() == replacement.type(), "Input and replacement must be of the same type"); @@ -106,9 +108,10 @@ struct replace_nans_functor { }; } // namespace + std::unique_ptr replace_nans(column_view const& input, column_view const& replacement, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.size() == replacement.size(), @@ -119,17 +122,17 @@ std::unique_ptr replace_nans(column_view const& input, input, *column_device_view::create(replacement), replacement.nullable(), - mr, - stream); + stream, + mr); } std::unique_ptr replace_nans(column_view const& input, scalar const& replacement, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return type_dispatcher( - input.type(), replace_nans_functor{}, input, replacement, true, mr, stream); + input.type(), replace_nans_functor{}, input, replacement, true, stream, mr); } } // namespace detail @@ -147,7 +150,7 @@ std::unique_ptr replace_nans(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_nans(input, replacement, 0, mr); + return detail::replace_nans(input, replacement, rmm::cuda_stream_default, mr); } } // namespace cudf @@ -175,9 +178,9 @@ struct normalize_nans_and_zeros_kernel_forwarder { template ::value>* = nullptr> void operator()(cudf::column_device_view in, cudf::mutable_column_device_view out, - cudaStream_t stream) + rmm::cuda_stream_view stream) { - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(in.size()), out.head(), @@ -188,7 +191,7 @@ struct normalize_nans_and_zeros_kernel_forwarder { template ::value>* = nullptr> void operator()(cudf::column_device_view in, cudf::mutable_column_device_view out, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_FAIL("Unexpected non floating-point type."); } @@ -198,7 +201,7 @@ struct normalize_nans_and_zeros_kernel_forwarder { namespace cudf { namespace detail { -void normalize_nans_and_zeros(mutable_column_view in_out, cudaStream_t stream = 0) +void 
normalize_nans_and_zeros(mutable_column_view in_out, rmm::cuda_stream_view stream) { if (in_out.is_empty()) { return; } CUDF_EXPECTS( @@ -240,11 +243,11 @@ std::unique_ptr normalize_nans_and_zeros(column_view const& input, { CUDF_FUNC_RANGE(); // output. copies the input - std::unique_ptr out = std::make_unique(input, (cudaStream_t)0, mr); + std::unique_ptr out = std::make_unique(input, rmm::cuda_stream_default, mr); // from device. unique_ptr which gets automatically cleaned up when we leave. auto out_view = out->mutable_view(); - detail::normalize_nans_and_zeros(out_view, 0); + detail::normalize_nans_and_zeros(out_view, rmm::cuda_stream_default); return out; } @@ -262,7 +265,7 @@ std::unique_ptr normalize_nans_and_zeros(column_view const& input, void normalize_nans_and_zeros(mutable_column_view& in_out) { CUDF_FUNC_RANGE(); - detail::normalize_nans_and_zeros(in_out, 0); + detail::normalize_nans_and_zeros(in_out, rmm::cuda_stream_default); } } // namespace cudf diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index d13d729536b..6f860dfd60d 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #include #include #include @@ -29,11 +30,12 @@ #include #include #include +#include #include #include -#include #include #include +#include "rmm/cuda_stream_view.hpp" #include @@ -148,8 +150,8 @@ struct replace_nulls_column_kernel_forwarder { template ()>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, cudf::column_view const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { cudf::size_type nrows = input.size(); cudf::detail::grid_1d grid{nrows, BLOCK_SIZE}; @@ -174,7 +176,7 @@ struct replace_nulls_column_kernel_forwarder { rmm::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); - replace<<>>( + replace<<>>( *device_in, *device_replacement, *device_out, valid_count); if (output_view.nullable()) { @@ -187,8 +189,8 @@ struct replace_nulls_column_kernel_forwarder { template ()>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, cudf::column_view const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("No specialization exists for the given type."); } @@ -198,8 +200,8 @@ template <> std::unique_ptr replace_nulls_column_kernel_forwarder::operator()( cudf::column_view const& input, cudf::column_view const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); @@ -224,7 +226,7 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< // Call first pass kernel to get sizes in offsets cudf::detail::grid_1d grid{input.size(), BLOCK_SIZE, 1}; - replace_first<<>>( + replace_first<<>>( *device_in, *device_replacement, reinterpret_cast(valid_bits.data()), @@ -233,21 +235,21 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< valid_count); std::unique_ptr offsets = cudf::strings::detail::make_offsets_child_column( - sizes_view.begin(), sizes_view.end(), mr, stream); + sizes_view.begin(), sizes_view.end(), mr, stream.value()); auto offsets_view = 
offsets->mutable_view(); int32_t size; CUDA_TRY(cudaMemcpyAsync( - &size, offsets_view.end() - 1, sizeof(int32_t), cudaMemcpyDefault, stream)); + &size, offsets_view.end() - 1, sizeof(int32_t), cudaMemcpyDefault, stream.value())); // Allocate chars array and output null mask - cudf::size_type null_count = input.size() - valid_counter.value(stream); - std::unique_ptr output_chars = - cudf::strings::detail::create_chars_child_column(input.size(), null_count, size, mr, stream); + cudf::size_type null_count = input.size() - valid_counter.value(stream); + std::unique_ptr output_chars = cudf::strings::detail::create_chars_child_column( + input.size(), null_count, size, mr, stream.value()); auto output_chars_view = output_chars->mutable_view(); - replace_second<<>>( + replace_second<<>>( *device_in, *device_replacement, reinterpret_cast(valid_bits.data()), @@ -268,12 +270,12 @@ template <> std::unique_ptr replace_nulls_column_kernel_forwarder::operator()( cudf::column_view const& input, cudf::column_view const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { cudf::dictionary_column_view dict_input(input); cudf::dictionary_column_view dict_repl(replacement); - return cudf::dictionary::detail::replace_nulls(dict_input, dict_repl, mr, stream); + return cudf::dictionary::detail::replace_nulls(dict_input, dict_repl, stream, mr); } template @@ -292,8 +294,8 @@ struct replace_nulls_scalar_kernel_forwarder { typename std::enable_if_t()>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, cudf::scalar const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch"); std::unique_ptr output = @@ -306,7 +308,7 @@ struct replace_nulls_scalar_kernel_forwarder { auto device_in = cudf::column_device_view::create(input); auto func = replace_nulls_functor{s1.data()}; - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.data(), input.data() + input.size(), cudf::detail::make_validity_iterator(*device_in), @@ -318,8 +320,8 @@ struct replace_nulls_scalar_kernel_forwarder { template ()>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, cudf::scalar const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("No specialization exists for the given type."); } @@ -329,24 +331,24 @@ template <> std::unique_ptr replace_nulls_scalar_kernel_forwarder::operator()( cudf::column_view const& input, cudf::scalar const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch"); cudf::strings_column_view input_s(input); const cudf::string_scalar& repl = static_cast(replacement); - return cudf::strings::replace_nulls(input_s, repl, mr); + return cudf::strings::detail::replace_nulls(input_s, repl, stream, mr); } template <> std::unique_ptr replace_nulls_scalar_kernel_forwarder::operator()( cudf::column_view const& input, cudf::scalar const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { 
cudf::dictionary_column_view dict_input(input); - return cudf::dictionary::detail::replace_nulls(dict_input, replacement, mr, stream); + return cudf::dictionary::detail::replace_nulls(dict_input, replacement, stream, mr); } } // end anonymous namespace @@ -355,8 +357,8 @@ namespace cudf { namespace detail { std::unique_ptr replace_nulls(cudf::column_view const& input, cudf::column_view const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch"); CUDF_EXPECTS(replacement.size() == input.size(), "Column size mismatch"); @@ -366,13 +368,13 @@ std::unique_ptr replace_nulls(cudf::column_view const& input, if (!input.has_nulls()) { return std::make_unique(input); } return cudf::type_dispatcher( - input.type(), replace_nulls_column_kernel_forwarder{}, input, replacement, mr, stream); + input.type(), replace_nulls_column_kernel_forwarder{}, input, replacement, stream, mr); } std::unique_ptr replace_nulls(cudf::column_view const& input, cudf::scalar const& replacement, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) { return cudf::empty_like(input); } @@ -381,7 +383,7 @@ std::unique_ptr replace_nulls(cudf::column_view const& input, } return cudf::type_dispatcher( - input.type(), replace_nulls_scalar_kernel_forwarder{}, input, replacement, mr, stream); + input.type(), replace_nulls_scalar_kernel_forwarder{}, input, replacement, stream, mr); } } // namespace detail @@ -391,7 +393,7 @@ std::unique_ptr replace_nulls(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::replace_nulls(input, replacement, mr, 0); + return cudf::detail::replace_nulls(input, replacement, rmm::cuda_stream_default, mr); } std::unique_ptr replace_nulls(cudf::column_view const& input, @@ -399,6 +401,6 @@ std::unique_ptr replace_nulls(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::replace_nulls(input, replacement, mr, 0); + return cudf::detail::replace_nulls(input, replacement, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index eef397b6a13..6ca894ac186 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -299,8 +299,8 @@ struct replace_kernel_forwarder { std::unique_ptr operator()(cudf::column_view const& input_col, cudf::column_view const& values_to_replace, cudf::column_view const& replacement_values, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); @@ -330,12 +330,12 @@ struct replace_kernel_forwarder { auto device_values_to_replace = cudf::column_device_view::create(values_to_replace); auto device_replacement_values = cudf::column_device_view::create(replacement_values); - replace<<>>(*device_in, - *device_out, - valid_count, - output_view.size(), - *device_values_to_replace, - *device_replacement_values); + replace<<>>(*device_in, + *device_out, + valid_count, + output_view.size(), + *device_values_to_replace, + *device_replacement_values); if (output_view.nullable()) { output->set_null_count(output->size() - valid_counter.value(stream)); @@ -347,8 
+347,8 @@ struct replace_kernel_forwarder { std::unique_ptr operator()(cudf::column_view const& input_col, cudf::column_view const& values_to_replace, cudf::column_view const& replacement_values, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("No specialization exists for this type"); } @@ -359,8 +359,8 @@ std::unique_ptr replace_kernel_forwarder::operator() valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); @@ -402,7 +402,7 @@ std::unique_ptr replace_kernel_forwarder::operator()>>( + replace_first<<>>( *device_in, *device_values_to_replace, *device_replacement, @@ -412,22 +412,23 @@ std::unique_ptr replace_kernel_forwarder::operator() offsets = cudf::strings::detail::make_offsets_child_column( - sizes_view.begin(), sizes_view.end(), mr, stream); + sizes_view.begin(), sizes_view.end(), mr, stream.value()); auto offsets_view = offsets->mutable_view(); auto device_offsets = cudf::mutable_column_device_view::create(offsets_view); int32_t size; CUDA_TRY(cudaMemcpyAsync( - &size, offsets_view.end() - 1, sizeof(int32_t), cudaMemcpyDefault, stream)); + &size, offsets_view.end() - 1, sizeof(int32_t), cudaMemcpyDefault, stream.value())); + stream.synchronize(); // Allocate chars array and output null mask cudf::size_type null_count = input_col.size() - valid_counter.value(stream); std::unique_ptr output_chars = cudf::strings::detail::create_chars_child_column( - input_col.size(), null_count, size, mr, stream); + input_col.size(), null_count, size, mr, stream.value()); auto output_chars_view = output_chars->mutable_view(); auto device_chars = cudf::mutable_column_device_view::create(output_chars_view); - replace_second<<>>( + replace_second<<>>( *device_in, *device_replacement, *device_offsets, *device_chars, *device_indices); return cudf::make_strings_column(input_col.size(), @@ -444,8 +445,8 @@ std::unique_ptr replace_kernel_forwarder::operator() replace_kernel_forwarder::operator()view(), mr, stream); + return cudf::dictionary::detail::add_keys(input, new_keys->view(), mr, stream.value()); }(); auto matched_view = cudf::dictionary_column_view(matched_input->view()); auto matched_values = cudf::dictionary::detail::set_keys( - values, matched_view.keys(), rmm::mr::get_current_device_resource(), stream); + values, matched_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); auto matched_replacements = cudf::dictionary::detail::set_keys( - replacements, matched_view.keys(), rmm::mr::get_current_device_resource(), stream); + replacements, matched_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); auto indices_type = matched_view.indices().type(); auto new_indices = cudf::type_dispatcher( @@ -469,8 +470,8 @@ std::unique_ptr replace_kernel_forwarder::operator()view()).indices(), cudf::dictionary_column_view(matched_replacements->view()).get_indices_annotated(), - mr, - stream); + stream, + mr); auto null_count = new_indices->null_count(); auto contents = new_indices->release(); auto indices_column = std::make_unique( @@ -489,8 +490,8 @@ namespace detail { std::unique_ptr find_and_replace_all(cudf::column_view const& input_col, cudf::column_view const& values_to_replace, cudf::column_view const& replacement_values, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(values_to_replace.size() == replacement_values.size(), "values_to_replace and 
replacement_values size mismatch."); @@ -509,8 +510,8 @@ std::unique_ptr find_and_replace_all(cudf::column_view const& inpu input_col, values_to_replace, replacement_values, - mr, - stream); + stream, + mr); } } // namespace detail @@ -532,6 +533,6 @@ std::unique_ptr find_and_replace_all(cudf::column_view const& inpu rmm::mr::device_memory_resource* mr) { return cudf::detail::find_and_replace_all( - input_col, values_to_replace, replacement_values, mr, 0); + input_col, values_to_replace, replacement_values, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index f373c97b1ef..a1aca664e25 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -95,11 +95,11 @@ std::unique_ptr replace(strings_column_view const& strings, string_scalar const& target, string_scalar const& repl, int32_t maxrepl, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid."); CUDF_EXPECTS(target.size() > 0, "Parameter target must not be empty string."); @@ -111,23 +111,22 @@ std::unique_ptr replace(strings_column_view const& strings, auto d_strings = *strings_column; // copy the null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), replace_fn{d_strings, d_target, d_repl, maxrepl}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream); + create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream.value()); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, replace_fn{d_strings, d_target, d_repl, maxrepl, d_offsets, d_chars}); @@ -184,11 +183,11 @@ std::unique_ptr replace_slice(strings_column_view const& strings, string_scalar const& repl, size_type start, size_type stop, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop."); @@ -205,22 +204,22 @@ std::unique_ptr replace_slice(strings_column_view const& strings, 
thrust::make_counting_iterator(0), replace_slice_fn{d_strings, d_repl, start, stop}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto offsets_view = offsets_column->view(); auto d_offsets = offsets_view.data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream); + create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream.value()); auto chars_view = chars_column->mutable_view(); auto d_chars = chars_view.data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, replace_slice_fn{d_strings, d_repl, start, stop, d_offsets, d_chars}); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -288,11 +287,11 @@ struct replace_multi_fn { std::unique_ptr replace(strings_column_view const& strings, strings_column_view const& targets, strings_column_view const& repls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)), "Parameters targets must not be empty and must not have nulls"); CUDF_EXPECTS(((repls.size() > 0) && (repls.null_count() == 0)), @@ -308,23 +307,22 @@ std::unique_ptr replace(strings_column_view const& strings, auto d_repls = *repls_column; // copy the null mask - rmm::device_buffer null_mask = - cudf::detail::copy_bitmask(strings.parent(), rmm::cuda_stream_view{stream}, mr); + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // build offsets column auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), replace_multi_fn{d_strings, d_targets, d_repls}); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = - create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream); + create_chars_child_column(strings_count, strings.null_count(), bytes, mr, stream.value()); auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, replace_multi_fn{d_strings, d_targets, d_repls, d_offsets, d_chars}); @@ -340,11 +338,11 @@ std::unique_ptr replace(strings_column_view const& strings, std::unique_ptr replace_nulls(strings_column_view const& strings, string_scalar const& repl, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return 
make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); string_view d_repl(repl.data(), repl.size()); @@ -359,15 +357,15 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, : d_strings.element(idx).size_bytes(); }); auto offsets_column = make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream); + offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream.value()); auto d_offsets = offsets_column->view().data(); // build chars column size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; auto chars_column = strings::detail::create_chars_child_column( - strings_count, strings.null_count(), bytes, mr, stream); + strings_count, strings.null_count(), bytes, mr, stream.value()); auto d_chars = chars_column->mutable_view().data(); - thrust::for_each_n(rmm::exec_policy(stream)->on(stream), + thrust::for_each_n(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), strings_count, [d_strings, d_repl, d_offsets, d_chars] __device__(size_type idx) { @@ -375,7 +373,7 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, if (!d_strings.is_null(idx)) d_str = d_strings.element(idx); memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); }); - // + return make_strings_column(strings_count, std::move(offsets_column), std::move(chars_column), @@ -396,7 +394,7 @@ std::unique_ptr replace(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace(strings, target, repl, maxrepl, mr); + return detail::replace(strings, target, repl, maxrepl, rmm::cuda_stream_default, mr); } std::unique_ptr replace_slice(strings_column_view const& strings, @@ -406,7 +404,7 @@ std::unique_ptr replace_slice(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_slice(strings, repl, start, stop, mr); + return detail::replace_slice(strings, repl, start, stop, rmm::cuda_stream_default, mr); } std::unique_ptr replace(strings_column_view const& strings, @@ -415,7 +413,7 @@ std::unique_ptr replace(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace(strings, targets, repls, mr); + return detail::replace(strings, targets, repls, rmm::cuda_stream_default, mr); } std::unique_ptr replace_nulls(strings_column_view const& strings, @@ -423,7 +421,7 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_nulls(strings, repl, mr); + return detail::replace_nulls(strings, repl, rmm::cuda_stream_default, mr); } } // namespace strings From 1340241c74a825d9fad939460733f375e7866873 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 14:26:12 +1100 Subject: [PATCH 32/51] Convert reshape/tile to cuda_stream_view --- cpp/include/cudf/detail/reshape.hpp | 4 +++- cpp/src/reshape/tile.cu | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp index fa56254b998..fb24b7669d7 100644 --- a/cpp/include/cudf/detail/reshape.hpp +++ b/cpp/include/cudf/detail/reshape.hpp @@ -18,6 +18,8 @@ #include +#include + #include namespace cudf { @@ -30,7 +32,7 @@ namespace detail { std::unique_ptr
tile( table_view const& input, size_type count, - cudaStream_t stream = 0, + rmm::cuda_stream_view = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu index 2803ee1bab3..c912143f6d7 100644 --- a/cpp/src/reshape/tile.cu +++ b/cpp/src/reshape/tile.cu @@ -26,6 +26,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace { @@ -39,7 +40,7 @@ struct tile_functor { namespace detail { std::unique_ptr
tile(const table_view &in, size_type count, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { CUDF_EXPECTS(count >= 0, "Count cannot be negative"); @@ -61,7 +62,7 @@ std::unique_ptr
tile(const table_view &in, rmm::mr::device_memory_resource *mr) { CUDF_FUNC_RANGE(); - return detail::tile(in, count, 0, mr); + return detail::tile(in, count, rmm::cuda_stream_default, mr); } } // namespace cudf From 7a0c0f2834201ef392fe754e4defd55baba225d9 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 14:28:33 +1100 Subject: [PATCH 33/51] Convert round to cuda_stream_view --- cpp/include/cudf/detail/round.hpp | 4 +++- cpp/src/round/round.cu | 16 +++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/detail/round.hpp b/cpp/include/cudf/detail/round.hpp index 7e9fb03e0b0..c56686fa113 100644 --- a/cpp/include/cudf/detail/round.hpp +++ b/cpp/include/cudf/detail/round.hpp @@ -18,6 +18,8 @@ #include +#include + namespace cudf { //! Inner interfaces and implementations namespace detail { @@ -32,7 +34,7 @@ std::unique_ptr round( column_view const& input, int32_t decimal_places, rounding_method method, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index dab1dce1a35..701fa35d262 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -28,6 +28,8 @@ #include #include +#include + #include namespace cudf { @@ -201,7 +203,7 @@ template ()>* = nullptr> std::unique_ptr round_with(column_view const& input, int32_t decimal_places, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { using Functor = RoundFunctor; @@ -215,7 +217,7 @@ std::unique_ptr round_with(column_view const& input, auto out_view = result->mutable_view(); T const n = std::pow(10, std::abs(decimal_places)); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), out_view.begin(), @@ -230,7 +232,7 @@ template ()>* = nullptr> std::unique_ptr round_with(column_view const& input, int32_t decimal_places, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { using namespace numeric; @@ -254,7 +256,7 @@ std::unique_ptr round_with(column_view const& input, auto out_view = result->mutable_view(); Type const n = std::pow(10, std::abs(decimal_places + input.type().scale())); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), out_view.begin(), @@ -276,7 +278,7 @@ struct round_type_dispatcher { column_view const& input, int32_t decimal_places, cudf::rounding_method method, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // clang-format off @@ -302,7 +304,7 @@ struct round_type_dispatcher { std::unique_ptr round(column_view const& input, int32_t decimal_places, cudf::rounding_method method, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(cudf::is_numeric(input.type()) || cudf::is_fixed_point(input.type()), @@ -328,7 +330,7 @@ std::unique_ptr round(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::round(input, decimal_places, method, 0, mr); + return cudf::detail::round(input, decimal_places, method, rmm::cuda_stream_default, mr); } } // namespace cudf From 61446f81170a2fd23b2913b4edc3bd3b1e3263d1 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 
Nov 2020 14:54:30 +1100 Subject: [PATCH 34/51] Convert scatter to cuda_stream_view --- cpp/include/cudf/detail/scatter.cuh | 48 ++++----- cpp/include/cudf/detail/scatter.hpp | 32 +++--- cpp/include/cudf/strings/detail/scatter.cuh | 18 ++-- cpp/src/copying/scatter.cu | 104 +++++++++++--------- cpp/src/groupby/sort/sort_helper.cu | 4 +- cpp/src/hash/hashing.cu | 9 +- cpp/src/partitioning/partitioning.cu | 4 +- 7 files changed, 113 insertions(+), 106 deletions(-) diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index 6d93c78fd3e..da5814933fa 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -28,6 +28,8 @@ #include #include +#include + namespace cudf { namespace detail { @@ -54,7 +56,7 @@ template auto scatter_to_gather(MapIterator scatter_map_begin, MapIterator scatter_map_end, size_type gather_rows, - cudaStream_t stream) + rmm::cuda_stream_view stream) { using MapValueType = typename thrust::iterator_traits::value_type; @@ -66,7 +68,7 @@ auto scatter_to_gather(MapIterator scatter_map_begin, // Convert scatter map to a gather map thrust::scatter( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(std::distance(scatter_map_begin, scatter_map_end)), scatter_map_begin, @@ -81,8 +83,8 @@ struct column_scatterer_impl { MapIterator scatter_map_begin, MapIterator scatter_map_end, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { auto result = std::make_unique(target, stream, mr); auto result_view = result->mutable_view(); @@ -91,7 +93,7 @@ struct column_scatterer_impl { // NOTE use source.begin + scatter rows rather than source.end in case the // scatter map is smaller than the number of source rows - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), source.begin(), source.begin() + cudf::distance(scatter_map_begin, scatter_map_end), scatter_map_begin, @@ -107,14 +109,14 @@ struct column_scatterer_impl { MapIterator scatter_map_begin, MapIterator scatter_map_end, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { using strings::detail::create_string_vector_from_column; - auto const source_vector = create_string_vector_from_column(source, stream); + auto const source_vector = create_string_vector_from_column(source, stream.value()); auto const begin = source_vector.begin(); auto const end = begin + std::distance(scatter_map_begin, scatter_map_end); - return strings::detail::scatter(begin, end, scatter_map_begin, target, mr, stream); + return strings::detail::scatter(begin, end, scatter_map_begin, target, stream, mr); } }; @@ -125,11 +127,11 @@ struct column_scatterer { MapIterator scatter_map_begin, MapIterator scatter_map_end, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { column_scatterer_impl scatterer{}; return scatterer(source, scatter_map_begin, scatter_map_end, target, stream, mr); } }; @@ -139,8 +141,8 @@ struct column_scatterer_impl { MapIterator scatter_map_begin, MapIterator 
scatter_map_end, column_view const& target_in, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { if (target_in.is_empty()) // empty begets empty return make_empty_column(data_type{type_id::DICTIONARY32}); @@ -154,17 +156,17 @@ struct column_scatterer_impl { "scatter dictionary keys must be the same type"); // first combine keys so both dictionaries have the same set - auto target_matched = dictionary::detail::add_keys(target, source.keys(), mr, stream); + auto target_matched = dictionary::detail::add_keys(target, source.keys(), mr, stream.value()); auto const target_view = dictionary_column_view(target_matched->view()); auto source_matched = dictionary::detail::set_keys( - source, target_view.keys(), rmm::mr::get_current_device_resource(), stream); + source, target_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); auto const source_view = dictionary_column_view(source_matched->view()); // now build the new indices by doing a scatter on just the matched indices auto source_itr = indexalator_factory::make_input_iterator(source_view.indices()); auto new_indices = std::make_unique(target_view.get_indices_annotated(), stream, mr); auto target_itr = indexalator_factory::make_output_iterator(new_indices->mutable_view()); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), source_itr, source_itr + std::distance(scatter_map_begin, scatter_map_end), scatter_map_begin, @@ -221,8 +223,8 @@ struct column_scatterer_impl { * are to be scattered * @param[in] check_bounds Optionally perform bounds checking on the values of * `scatter_map` and throw an error if any of its values are out of bounds. - * @param[in] mr Device memory resource used to allocate the returned table's device memory * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned table's device memory * * @return Result of scattering values from source to target **/ @@ -233,8 +235,8 @@ std::unique_ptr
scatter( MapIterator scatter_map_end, table_view const& target, bool check_bounds = false, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_FUNC_RANGE(); @@ -247,7 +249,7 @@ std::unique_ptr
scatter( CUDF_EXPECTS( std::distance(scatter_map_begin, scatter_map_end) == thrust::count_if( - rmm::exec_policy(stream)->on(stream), scatter_map_begin, scatter_map_end, bounds), + rmm::exec_policy(stream)->on(stream.value()), scatter_map_begin, scatter_map_end, bounds), "Scatter map index out of bounds"); } @@ -276,8 +278,8 @@ std::unique_ptr
scatter( updated_scatter_map_begin, updated_scatter_map_end, target_col, - mr, - stream); + stream, + mr); }); auto gather_map = scatter_to_gather( diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index 6f92ae3b553..a5676c86f49 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,9 @@ #include #include #include +#include + +#include #include @@ -55,8 +58,8 @@ namespace detail { * are to be scattered * @param check_bounds Optionally perform bounds checking on the values of * `scatter_map` and throw an error if any of its values are out of bounds. - * @param mr Device memory resource used to allocate the returned table's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target **/ std::unique_ptr
scatter( @@ -64,8 +67,8 @@ std::unique_ptr
scatter( column_view const& scatter_map, table_view const& target, bool check_bounds = false, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Scatters a row of scalar values into a copy of the target table @@ -95,8 +98,8 @@ std::unique_ptr
scatter( * are to be scattered * @param check_bounds Optionally perform bounds checking on the values of * `scatter_map` and throw an error if any of its values are out of bounds. - * @param mr Device memory resource used to allocate the returned table's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target **/ std::unique_ptr
scatter( @@ -104,8 +107,8 @@ std::unique_ptr
scatter( column_view const& indices, table_view const& target, bool check_bounds = false, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::boolean_mask_scatter( @@ -115,11 +118,12 @@ std::unique_ptr
scatter( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
boolean_mask_scatter(table_view const& source, - table_view const& target, - column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0); +std::unique_ptr
boolean_mask_scatter( + table_view const& source, + table_view const& target, + column_view const& boolean_mask, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::boolean_mask_scatter( @@ -134,8 +138,8 @@ std::unique_ptr
boolean_mask_scatter( std::vector> const& source, table_view const& target, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index 4f495afa099..9e0497052a6 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -57,28 +57,28 @@ std::unique_ptr scatter( SourceIterator end, MapIterator scatter_map, strings_column_view const& target, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = target.size(); - if (strings_count == 0) return make_empty_strings_column(mr, stream); + if (strings_count == 0) return make_empty_strings_column(mr, stream.value()); // create null mask -- caller must update this rmm::device_buffer null_mask{0, stream, mr}; - if (target.has_nulls()) - null_mask = cudf::detail::copy_bitmask(target.parent(), rmm::cuda_stream_view{stream}, mr); + if (target.has_nulls()) null_mask = cudf::detail::copy_bitmask(target.parent(), stream, mr); // create string vectors - rmm::device_vector target_vector = create_string_vector_from_column(target, stream); + rmm::device_vector target_vector = + create_string_vector_from_column(target, stream.value()); // do the scatter thrust::scatter( - rmm::exec_policy(stream)->on(stream), begin, end, scatter_map, target_vector.begin()); + rmm::exec_policy(stream)->on(stream.value()), begin, end, scatter_map, target_vector.begin()); // build offsets column - auto offsets_column = child_offsets_from_string_vector(target_vector, mr, stream); + auto offsets_column = child_offsets_from_string_vector(target_vector, mr, stream.value()); // build chars column auto chars_column = child_chars_from_string_vector( - target_vector, offsets_column->view().data(), 0, mr, stream); + target_vector, offsets_column->view().data(), 0, mr, stream.value()); return make_strings_column(strings_count, std::move(offsets_column), diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 373ed224f99..7b50477fc20 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -32,6 +32,8 @@ #include #include +#include + #include #include #include @@ -65,8 +67,8 @@ void scatter_scalar_bitmask(std::vector> co MapIterator scatter_map, size_type num_scatter_rows, std::vector>& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { constexpr size_type block_size = 256; size_type const grid_size = grid_1d(num_scatter_rows, block_size).num_blocks; @@ -84,7 +86,7 @@ void scatter_scalar_bitmask(std::vector> co auto bitmask_kernel = source_is_valid ? 
marking_bitmask_kernel : marking_bitmask_kernel; - bitmask_kernel<<>>( + bitmask_kernel<<>>( *target_view, scatter_map, num_scatter_rows); } } @@ -96,8 +98,8 @@ struct column_scalar_scatterer_impl { MapIterator scatter_iter, size_type scatter_rows, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match"); @@ -111,7 +113,7 @@ struct column_scalar_scatterer_impl { auto scalar_iter = thrust::make_permutation_iterator(scalar_impl->data(), thrust::make_constant_iterator(0)); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), scalar_iter, scalar_iter + scatter_rows, scatter_iter, @@ -127,8 +129,8 @@ struct column_scalar_scatterer_impl { MapIterator scatter_iter, size_type scatter_rows, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(source.get().type() == target.type(), "scalar and column types must match"); @@ -136,7 +138,7 @@ struct column_scalar_scatterer_impl { auto const source_view = string_view(scalar_impl->data(), scalar_impl->size()); auto const begin = thrust::make_constant_iterator(source_view); auto const end = begin + scatter_rows; - return strings::detail::scatter(begin, end, scatter_iter, target, mr, stream); + return strings::detail::scatter(begin, end, scatter_iter, target, stream, mr); } }; @@ -146,8 +148,8 @@ struct column_scalar_scatterer_impl { MapIterator scatter_iter, size_type scatter_rows, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("scatter scalar to list_view not implemented"); } @@ -159,8 +161,8 @@ struct column_scalar_scatterer_impl { MapIterator scatter_iter, size_type scatter_rows, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("scatter scalar to struct_view not implemented"); } @@ -172,27 +174,29 @@ struct column_scalar_scatterer_impl { MapIterator scatter_iter, size_type scatter_rows, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { auto dict_target = dictionary::detail::add_keys( dictionary_column_view(target), make_column_from_scalar(source.get(), 1, stream, rmm::mr::get_current_device_resource()) ->view(), mr, - stream); + stream.value()); auto dict_view = dictionary_column_view(dict_target->view()); auto scalar_index = dictionary::detail::get_index( - dict_view, source.get(), rmm::mr::get_current_device_resource(), stream); + dict_view, source.get(), rmm::mr::get_current_device_resource(), stream.value()); auto scalar_iter = thrust::make_permutation_iterator( indexalator_factory::make_input_iterator(*scalar_index), thrust::make_constant_iterator(0)); auto new_indices = std::make_unique(dict_view.get_indices_annotated(), stream, mr); auto target_iter = indexalator_factory::make_output_iterator(new_indices->mutable_view()); - thrust::scatter(rmm::exec_policy(stream)->on(stream), + + thrust::scatter(rmm::exec_policy(stream)->on(stream.value()), scalar_iter, 
scalar_iter + scatter_rows, scatter_iter, target_iter); + // build the dictionary indices column from the result auto const indices_type = new_indices->type(); auto const output_size = new_indices->size(); @@ -220,11 +224,11 @@ struct column_scalar_scatterer { MapIterator scatter_iter, size_type scatter_rows, column_view const& target, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { column_scalar_scatterer_impl scatterer{}; - return scatterer(source, scatter_iter, scatter_rows, target, mr, stream); + return scatterer(source, scatter_iter, scatter_rows, target, stream, mr); } }; @@ -234,8 +238,8 @@ std::unique_ptr
scatter(table_view const& source, column_view const& scatter_map, table_view const& target, bool check_bounds, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(source.num_columns() == target.num_columns(), "Number of columns in source and target not equal"); @@ -255,15 +259,15 @@ std::unique_ptr
scatter(table_view const& source, // create index type normalizing iterator for the scatter_map auto map_begin = indexalator_factory::make_input_iterator(scatter_map); auto map_end = map_begin + scatter_map.size(); - return detail::scatter(source, map_begin, map_end, target, check_bounds, mr, stream); + return detail::scatter(source, map_begin, map_end, target, check_bounds, stream, mr); } std::unique_ptr
scatter(std::vector> const& source, column_view const& indices, table_view const& target, bool check_bounds, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(source.size() == static_cast(target.num_columns()), "Number of columns in source and target not equal"); @@ -279,7 +283,7 @@ std::unique_ptr
scatter(std::vector> auto const n_rows = target.num_rows(); if (check_bounds) { CUDF_EXPECTS( - indices.size() == thrust::count_if(rmm::exec_policy(stream)->on(stream), + indices.size() == thrust::count_if(rmm::exec_policy(stream)->on(stream.value()), map_begin, map_end, [n_rows] __device__(size_type index) { @@ -307,11 +311,11 @@ std::unique_ptr
scatter(std::vector> scatter_iter, scatter_rows, target_col, - mr, - stream); + stream, + mr); }); - scatter_scalar_bitmask(source, scatter_iter, scatter_rows, result, mr, stream); + scatter_scalar_bitmask(source, scatter_iter, scatter_rows, result, stream, mr); return std::make_unique
<table>(std::move(result)); } @@ -319,27 +323,29 @@ std::unique_ptr<column>
scatter(std::vector> std::unique_ptr boolean_mask_scatter(column_view const& input, column_view const& target, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto indices = cudf::make_numeric_column( data_type{type_id::INT32}, target.size(), mask_state::UNALLOCATED, stream); auto mutable_indices = indices->mutable_view(); - thrust::sequence(rmm::exec_policy(stream)->on(stream), + thrust::sequence(rmm::exec_policy(stream)->on(stream.value()), mutable_indices.begin(), mutable_indices.end(), 0); // The scatter map is actually a table with only one column, which is scatter map. - auto scatter_map = detail::apply_boolean_mask( - table_view{{indices->view()}}, boolean_mask, rmm::mr::get_current_device_resource(), stream); + auto scatter_map = detail::apply_boolean_mask(table_view{{indices->view()}}, + boolean_mask, + rmm::mr::get_current_device_resource(), + stream.value()); auto output_table = detail::scatter(table_view{{input}}, scatter_map->get_column(0).view(), table_view{{target}}, false, - mr, - stream); + stream, + mr); // There is only one column in output_table return std::make_unique(std::move(output_table->get_column(0))); @@ -348,8 +354,8 @@ std::unique_ptr boolean_mask_scatter(column_view const& input, std::unique_ptr boolean_mask_scatter(scalar const& input, column_view const& target, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return detail::copy_if_else(input, target, boolean_mask, stream, mr); } @@ -357,8 +363,8 @@ std::unique_ptr boolean_mask_scatter(scalar const& input, std::unique_ptr
boolean_mask_scatter(table_view const& input, table_view const& target, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.num_columns() == target.num_columns(), "Mismatch in number of input columns and target columns"); @@ -382,7 +388,7 @@ std::unique_ptr
boolean_mask_scatter(table_view const& input, target.begin(), out_columns.begin(), [&boolean_mask, mr, stream](auto const& input_column, auto const& target_column) { - return boolean_mask_scatter(input_column, target_column, boolean_mask, mr, stream); + return boolean_mask_scatter(input_column, target_column, boolean_mask, stream, mr); }); return std::make_unique
<table>(std::move(out_columns)); @@ -395,8 +401,8 @@ std::unique_ptr<table>
boolean_mask_scatter( std::vector> const& input, table_view const& target, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(static_cast(input.size()) == target.num_columns(), "Mismatch in number of scalars and target columns"); @@ -421,7 +427,7 @@ std::unique_ptr
boolean_mask_scatter( out_columns.begin(), [&boolean_mask, mr, stream](auto const& scalar, auto const& target_column) { return boolean_mask_scatter( - scalar.get(), target_column, boolean_mask, mr, stream); + scalar.get(), target_column, boolean_mask, stream, mr); }); return std::make_unique
<table>(std::move(out_columns)); @@ -439,7 +445,7 @@ std::unique_ptr<table>
scatter(table_view const& source, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::scatter(source, scatter_map, target, check_bounds, mr); + return detail::scatter(source, scatter_map, target, check_bounds, rmm::cuda_stream_default, mr); } std::unique_ptr
scatter(std::vector> const& source, @@ -449,7 +455,7 @@ std::unique_ptr
scatter(std::vector> rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::scatter(source, indices, target, check_bounds, mr); + return detail::scatter(source, indices, target, check_bounds, rmm::cuda_stream_default, mr); } std::unique_ptr
boolean_mask_scatter(table_view const& input, @@ -458,7 +464,7 @@ std::unique_ptr
boolean_mask_scatter(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::boolean_mask_scatter(input, target, boolean_mask, mr); + return detail::boolean_mask_scatter(input, target, boolean_mask, rmm::cuda_stream_default, mr); } std::unique_ptr
boolean_mask_scatter( @@ -468,7 +474,7 @@ std::unique_ptr
boolean_mask_scatter( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::boolean_mask_scatter(input, target, boolean_mask, mr); + return detail::boolean_mask_scatter(input, target, boolean_mask, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 064c3e97b20..93d785b78f0 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -231,8 +231,8 @@ column_view sort_groupby_helper::unsorted_keys_labels(rmm::cuda_stream_view stre scatter_map, table_view({temp_labels->view()}), false, - rmm::mr::get_current_device_resource(), - stream.value()); + stream, + rmm::mr::get_current_device_resource()); _unsorted_keys_labels = std::move(t_unsorted_keys_labels->release()[0]); diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index ab703c78261..8e91de9707f 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -602,13 +602,8 @@ std::pair, std::vector> hash_partition_table( row_output_locations, num_rows, num_partitions, scanned_block_partition_sizes_ptr); // Use the resulting scatter map to materialize the output - auto output = detail::scatter(input, - row_partition_numbers.begin(), - row_partition_numbers.end(), - input, - false, - mr, - stream.value()); + auto output = detail::scatter( + input, row_partition_numbers.begin(), row_partition_numbers.end(), input, false, stream, mr); return std::make_pair(std::move(output), std::move(partition_offsets)); } diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index c63b7079a07..3d0f35568f4 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -614,7 +614,7 @@ std::pair, std::vector> hash_partition_table( // Use the resulting scatter map to materialize the output auto output = detail::scatter( - input, row_partition_numbers.begin(), row_partition_numbers.end(), input, false, mr, stream); + input, row_partition_numbers.begin(), row_partition_numbers.end(), input, false, stream, mr); return std::make_pair(std::move(output), std::move(partition_offsets)); } @@ -702,7 +702,7 @@ struct dispatch_map_type { // Scatter the rows into their partitions auto scattered = - cudf::detail::scatter(t, scatter_map.begin(), scatter_map.end(), t, false, mr, stream); + cudf::detail::scatter(t, scatter_map.begin(), scatter_map.end(), t, false, stream, mr); return std::make_pair(std::move(scattered), std::move(partition_offsets)); } From 091aa274dbd91e46a81d5894a95a02cc89e95921 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 15:17:11 +1100 Subject: [PATCH 35/51] Convert search to cuda_stream_view --- cpp/include/cudf/detail/search.hpp | 18 ++-- cpp/include/cudf/dictionary/detail/search.hpp | 12 ++- cpp/src/copying/scatter.cu | 2 +- cpp/src/dictionary/add_keys.cu | 4 +- cpp/src/dictionary/remove_keys.cu | 4 +- cpp/src/dictionary/replace.cu | 2 +- cpp/src/dictionary/search.cu | 44 +++++---- cpp/src/dictionary/set_keys.cu | 2 +- cpp/src/filling/fill.cu | 2 +- cpp/src/replace/clamp.cu | 8 +- cpp/src/search/search.cu | 96 ++++++++++--------- cpp/src/transform/encode.cu | 4 +- 12 files changed, 106 insertions(+), 92 deletions(-) diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp index 3eca864ab52..c986418c790 100644 --- a/cpp/include/cudf/detail/search.hpp +++ b/cpp/include/cudf/detail/search.hpp @@ -21,6 +21,8 @@ #include #include +#include + #include namespace cudf { @@ -35,8 
+37,8 @@ std::unique_ptr lower_bound( table_view const& values, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t steam = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::upper_bound @@ -48,8 +50,8 @@ std::unique_ptr upper_bound( table_view const& values, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::contains(column_view const&, scalar const&, @@ -57,7 +59,9 @@ std::unique_ptr upper_bound( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -bool contains(column_view const& col, scalar const& value, cudaStream_t stream = 0); +bool contains(column_view const& col, + scalar const& value, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @copydoc cudf::contains(column_view const&, column_view const&, @@ -68,8 +72,8 @@ bool contains(column_view const& col, scalar const& value, cudaStream_t stream = std::unique_ptr contains( column_view const& haystack, column_view const& needles, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/dictionary/detail/search.hpp b/cpp/include/cudf/dictionary/detail/search.hpp index 21ab0c92acd..cc0e8d0319b 100644 --- a/cpp/include/cudf/dictionary/detail/search.hpp +++ b/cpp/include/cudf/dictionary/detail/search.hpp @@ -17,6 +17,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -30,8 +32,8 @@ namespace detail { std::unique_ptr get_index( dictionary_column_view const& dictionary, scalar const& key, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Get the index for a key if it were added to the given dictionary. @@ -48,15 +50,15 @@ std::unique_ptr get_index( * * @param dictionary The dictionary to search for the key. * @param key The value to search for in the dictionary keyset. - * @param mr Device memory resource used to allocate the returned column's device memory. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
* @return Numeric scalar index value of the key within the dictionary */ std::unique_ptr get_insert_index( dictionary_column_view const& dictionary, scalar const& key, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace dictionary diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 7b50477fc20..887972f2dd6 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -185,7 +185,7 @@ struct column_scalar_scatterer_impl { stream.value()); auto dict_view = dictionary_column_view(dict_target->view()); auto scalar_index = dictionary::detail::get_index( - dict_view, source.get(), rmm::mr::get_current_device_resource(), stream.value()); + dict_view, source.get(), stream, rmm::mr::get_current_device_resource()); auto scalar_iter = thrust::make_permutation_iterator( indexalator_factory::make_input_iterator(*scalar_index), thrust::make_constant_iterator(0)); auto new_indices = std::make_unique(dict_view.get_indices_annotated(), stream, mr); diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index c02f38e2a0e..64ce8d1e07e 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -76,8 +76,8 @@ std::unique_ptr add_keys( table_view{{old_keys}}, std::vector{order::ASCENDING}, std::vector{null_order::AFTER}, // should be no nulls here - mr, - stream); + stream, + mr); // now create the indices column -- map old values to the new ones // gather([4,0,3,1,2,2,2,4,0],[0,1,2,3,5]) = [5,0,3,1,2,2,2,5,0] column_view indices_view(dictionary_column.indices().type(), diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index e04c6257692..f0f86a3dd1a 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -153,7 +153,7 @@ std::unique_ptr remove_keys( CUDF_EXPECTS(keys_view.type() == keys_to_remove.type(), "keys types must match"); // locate keys to remove by searching the keys column - auto const matches = cudf::detail::contains(keys_view, keys_to_remove, mr, stream); + auto const matches = cudf::detail::contains(keys_view, keys_to_remove, stream, mr); auto d_matches = matches->view().data(); // call common utility method to keep the keys not matched to keys_to_remove auto key_matcher = [d_matches] __device__(size_type idx) { return !d_matches[idx]; }; @@ -177,7 +177,7 @@ std::unique_ptr remove_unused_keys( rmm::exec_policy(stream)->on(stream), keys_positions.begin(), keys_positions.end()); // wrap the indices for comparison in contains() column_view keys_positions_view(data_type{type_id::UINT32}, keys_size, keys_positions.data()); - return cudf::detail::contains(keys_positions_view, indices_view, mr, stream); + return cudf::detail::contains(keys_positions_view, indices_view, stream, mr); }(); auto d_matches = matches->view().data(); diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 097490c4ff3..27a85c03898 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -178,7 +178,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, auto input_matched = dictionary::detail::add_keys( input, make_column_from_scalar(replacement, 1, stream, default_mr)->view(), mr, stream.value()); auto const input_view = dictionary_column_view(input_matched->view()); - auto const 
scalar_index = get_index(input_view, replacement, default_mr, stream.value()); + auto const scalar_index = get_index(input_view, replacement, stream, default_mr); // now build the new indices by doing replace-null on the updated indices auto const input_indices = input_view.get_indices_annotated(); diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu index 942415ffb77..5f82fc8a36d 100644 --- a/cpp/src/dictionary/search.cu +++ b/cpp/src/dictionary/search.cu @@ -21,6 +21,8 @@ #include #include +#include + #include #include @@ -34,7 +36,7 @@ struct dispatch_scalar_index { template ()>* = nullptr> std::unique_ptr operator()(size_type index, bool is_valid, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return std::make_unique>(index, is_valid, stream, mr); @@ -63,8 +65,8 @@ struct find_index_fn { not std::is_same::value>* = nullptr> std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { if (!key.is_valid()) return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr); @@ -92,8 +94,8 @@ struct find_index_fn { std::enable_if_t::value>* = nullptr> std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("dictionary column cannot be the keys column of another dictionary"); } @@ -101,8 +103,8 @@ struct find_index_fn { template ::value>* = nullptr> std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("list_view column cannot be the keys column of a dictionary"); } @@ -111,8 +113,8 @@ struct find_index_fn { std::enable_if_t::value>* = nullptr> std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FAIL("struct_view column cannot be the keys column of a dictionary"); } @@ -125,8 +127,8 @@ struct find_insert_index_fn { not std::is_same::value>* = nullptr> std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { if (!key.is_valid()) return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr); @@ -137,7 +139,7 @@ struct find_insert_index_fn { using ScalarType = cudf::scalar_type_t; auto find_key = static_cast(key).value(stream); auto keys_view = column_device_view::create(input.keys(), stream); - auto iter = thrust::lower_bound(rmm::exec_policy(stream)->on(stream), + auto iter = thrust::lower_bound(rmm::exec_policy(stream)->on(stream.value()), keys_view->begin(), keys_view->end(), find_key); @@ -155,8 +157,8 @@ struct find_insert_index_fn { std::is_same::value>* = nullptr> std::unique_ptr operator()(dictionary_column_view const& input, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) const + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) 
const { CUDF_FAIL("column cannot be the keys for dictionary"); } @@ -166,23 +168,23 @@ struct find_insert_index_fn { std::unique_ptr get_index(dictionary_column_view const& dictionary, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (dictionary.is_empty()) return std::make_unique>(0, false, stream, mr); - return type_dispatcher(dictionary.keys().type(), find_index_fn(), dictionary, key, mr, stream); + return type_dispatcher(dictionary.keys().type(), find_index_fn(), dictionary, key, stream, mr); } std::unique_ptr get_insert_index(dictionary_column_view const& dictionary, scalar const& key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (dictionary.is_empty()) return std::make_unique>(0, false, stream, mr); return type_dispatcher( - dictionary.keys().type(), find_insert_index_fn(), dictionary, key, mr, stream); + dictionary.keys().type(), find_insert_index_fn(), dictionary, key, stream, mr); } } // namespace detail @@ -194,7 +196,7 @@ std::unique_ptr get_index(dictionary_column_view const& dictionary, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::get_index(dictionary, key, mr); + return detail::get_index(dictionary, key, rmm::cuda_stream_default, mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index d95fdefe153..ae4a817f182 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -119,7 +119,7 @@ std::unique_ptr set_keys( std::unique_ptr keys_column(std::move(table_keys.front())); // compute the new nulls - auto matches = cudf::detail::contains(keys, keys_column->view(), mr, stream); + auto matches = cudf::detail::contains(keys, keys_column->view(), stream, mr); auto d_matches = matches->view().data(); auto indices_itr = cudf::detail::indexalator_factory::make_input_iterator(dictionary_column.indices()); diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index 6fba9bc01a5..d4fd526ff4b 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -171,7 +171,7 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()view(), value, rmm::mr::get_current_device_resource(), stream.value()); + target_matched->view(), value, stream, rmm::mr::get_current_device_resource()); // now call fill using just the indices column and the new index auto new_indices = cudf::type_dispatcher(target_indices.type(), diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index a2fd8c91bc7..cdd8d78fdef 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -338,15 +338,15 @@ std::unique_ptr dispatch_clamp::operator()( // get the indexes for lo_replace and for hi_replace auto lo_replace_index = dictionary::detail::get_index( - matched_view, lo_replace, rmm::mr::get_current_device_resource(), stream); + matched_view, lo_replace, stream, rmm::mr::get_current_device_resource()); auto hi_replace_index = dictionary::detail::get_index( - matched_view, hi_replace, rmm::mr::get_current_device_resource(), stream); + matched_view, hi_replace, stream, rmm::mr::get_current_device_resource()); // get the closest indexes for lo and for hi auto lo_index = dictionary::detail::get_insert_index( - matched_view, lo, rmm::mr::get_current_device_resource(), stream); + matched_view, lo, stream, rmm::mr::get_current_device_resource()); auto hi_index = 
dictionary::detail::get_insert_index( - matched_view, hi, rmm::mr::get_current_device_resource(), stream); + matched_view, hi, stream, rmm::mr::get_current_device_resource()); // call clamp with the scalar indexes and the matched indices auto matched_indices = matched_view.get_indices_annotated(); diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index 92b7ea49fd3..e8d776d0d2a 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -26,9 +26,12 @@ #include #include -#include #include +#include + +#include + namespace cudf { namespace { template on(stream), + thrust::lower_bound(rmm::exec_policy(stream)->on(stream.value()), it_data, it_data + data_size, it_vals, @@ -53,7 +56,7 @@ void launch_search(DataIterator it_data, it_output, comp); } else { - thrust::upper_bound(rmm::exec_policy(stream)->on(stream), + thrust::upper_bound(rmm::exec_policy(stream)->on(stream.value()), it_data, it_data + data_size, it_vals, @@ -68,8 +71,8 @@ std::unique_ptr search_ordered(table_view const& t, bool find_first, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Allocate result column std::unique_ptr result = make_numeric_column( @@ -79,7 +82,8 @@ std::unique_ptr search_ordered(table_view const& t, // Handle empty inputs if (t.num_rows() == 0) { - CUDA_TRY(cudaMemset(result_view.data(), 0, values.num_rows() * sizeof(size_type))); + CUDA_TRY(cudaMemsetAsync( + result_view.data(), 0, values.num_rows() * sizeof(size_type), stream.value())); return result; } @@ -96,7 +100,7 @@ std::unique_ptr search_ordered(table_view const& t, // This utility will ensure all corresponding dictionary columns have matching keys. // It will return any new dictionary columns created as well as updated table_views. 
auto matched = dictionary::detail::match_dictionaries( - {t, values}, rmm::mr::get_current_device_resource(), stream); + {t, values}, rmm::mr::get_current_device_resource(), stream.value()); auto d_t = table_device_view::create(matched.second.front(), stream); auto d_values = table_device_view::create(matched.second.back(), stream); auto count_it = thrust::make_counting_iterator(0); @@ -143,7 +147,7 @@ std::unique_ptr search_ordered(table_view const& t, struct contains_scalar_dispatch { template - bool operator()(column_view const& col, scalar const& value, cudaStream_t stream) + bool operator()(column_view const& col, scalar const& value, rmm::cuda_stream_view stream) { CUDF_EXPECTS(col.type() == value.type(), "scalar and column types must match"); @@ -153,14 +157,14 @@ struct contains_scalar_dispatch { auto s = static_cast(&value); if (col.has_nulls()) { - auto found_iter = thrust::find(rmm::exec_policy(stream)->on(stream), + auto found_iter = thrust::find(rmm::exec_policy(stream)->on(stream.value()), d_col->pair_begin(), d_col->pair_end(), thrust::make_pair(s->value(), true)); return found_iter != d_col->pair_end(); } else { - auto found_iter = thrust::find(rmm::exec_policy(stream)->on(stream), // + auto found_iter = thrust::find(rmm::exec_policy(stream)->on(stream.value()), // d_col->begin(), d_col->end(), s->value()); @@ -173,7 +177,7 @@ struct contains_scalar_dispatch { template <> bool contains_scalar_dispatch::operator()(column_view const& col, scalar const& value, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_FAIL("list_view type not supported yet"); } @@ -181,7 +185,7 @@ bool contains_scalar_dispatch::operator()(column_view const& co template <> bool contains_scalar_dispatch::operator()(column_view const& col, scalar const& value, - cudaStream_t stream) + rmm::cuda_stream_view stream) { CUDF_FAIL("struct_view type not supported yet"); } @@ -189,12 +193,12 @@ bool contains_scalar_dispatch::operator()(column_view const& template <> bool contains_scalar_dispatch::operator()(column_view const& col, scalar const& value, - cudaStream_t stream) + rmm::cuda_stream_view stream) { auto dict_col = cudf::dictionary_column_view(col); // first, find the value in the dictionary's key set auto index = cudf::dictionary::detail::get_index( - dict_col, value, rmm::mr::get_current_device_resource(), stream); + dict_col, value, stream, rmm::mr::get_current_device_resource()); // if found, check the index is actually in the indices column return index->is_valid() ? 
cudf::type_dispatcher(dict_col.indices().type(), contains_scalar_dispatch{}, @@ -207,7 +211,7 @@ bool contains_scalar_dispatch::operator()(column_view const& } // namespace namespace detail { -bool contains(column_view const& col, scalar const& value, cudaStream_t stream) +bool contains(column_view const& col, scalar const& value, rmm::cuda_stream_view stream) { if (col.is_empty()) { return false; } @@ -220,8 +224,8 @@ struct multi_contains_dispatch { template std::unique_ptr operator()(column_view const& haystack, column_view const& needles, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::unique_ptr result = make_numeric_column(data_type{type_to_id()}, haystack.size(), @@ -235,21 +239,21 @@ struct multi_contains_dispatch { mutable_column_view result_view = result.get()->mutable_view(); if (needles.is_empty()) { - thrust::fill(rmm::exec_policy(stream)->on(stream), + thrust::fill(rmm::exec_policy(stream)->on(stream.value()), result_view.begin(), result_view.end(), false); return result; } - auto hash_set = cudf::detail::unordered_multiset::create(needles, stream); + auto hash_set = cudf::detail::unordered_multiset::create(needles, stream.value()); auto device_hash_set = hash_set.to_device(); auto d_haystack_ptr = column_device_view::create(haystack, stream); auto d_haystack = *d_haystack_ptr; if (haystack.has_nulls()) { - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(haystack.size()), result_view.begin(), @@ -258,7 +262,7 @@ struct multi_contains_dispatch { device_hash_set.contains(d_haystack.element(index)); }); } else { - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(haystack.size()), result_view.begin(), @@ -275,8 +279,8 @@ template <> std::unique_ptr multi_contains_dispatch::operator()( column_view const& haystack, column_view const& needles, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("list_view type not supported"); } @@ -285,8 +289,8 @@ template <> std::unique_ptr multi_contains_dispatch::operator()( column_view const& haystack, column_view const& needles, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("struct_view type not supported"); } @@ -295,17 +299,17 @@ template <> std::unique_ptr multi_contains_dispatch::operator()( column_view const& haystack_in, column_view const& needles_in, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { dictionary_column_view const haystack(haystack_in); dictionary_column_view const needles(needles_in); // first combine keys so both dictionaries have the same set auto haystack_matched = dictionary::detail::add_keys( - haystack, needles.keys(), rmm::mr::get_current_device_resource(), stream); + haystack, needles.keys(), rmm::mr::get_current_device_resource(), stream.value()); auto const haystack_view = dictionary_column_view(haystack_matched->view()); auto needles_matched = dictionary::detail::set_keys( - needles, haystack_view.keys(), rmm::mr::get_current_device_resource(), stream); + needles, 
haystack_view.keys(), rmm::mr::get_current_device_resource(), stream.value()); auto const needles_view = dictionary_column_view(needles_matched->view()); // now just use the indices for the contains @@ -315,39 +319,39 @@ std::unique_ptr multi_contains_dispatch::operator()( multi_contains_dispatch{}, haystack_indices, needles_indices, - mr, - stream); + stream, + mr); } std::unique_ptr contains(column_view const& haystack, column_view const& needles, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(haystack.type() == needles.type(), "DTYPE mismatch"); return cudf::type_dispatcher( - haystack.type(), multi_contains_dispatch{}, haystack, needles, mr, stream); + haystack.type(), multi_contains_dispatch{}, haystack, needles, stream, mr); } std::unique_ptr lower_bound(table_view const& t, table_view const& values, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return search_ordered(t, values, true, column_order, null_precedence, mr, stream); + return search_ordered(t, values, true, column_order, null_precedence, stream, mr); } std::unique_ptr upper_bound(table_view const& t, table_view const& values, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return search_ordered(t, values, false, column_order, null_precedence, mr, stream); + return search_ordered(t, values, false, column_order, null_precedence, stream, mr); } } // namespace detail @@ -361,7 +365,8 @@ std::unique_ptr lower_bound(table_view const& t, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::lower_bound(t, values, column_order, null_precedence, mr); + return detail::lower_bound( + t, values, column_order, null_precedence, rmm::cuda_stream_default, mr); } std::unique_ptr upper_bound(table_view const& t, @@ -371,13 +376,14 @@ std::unique_ptr upper_bound(table_view const& t, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::upper_bound(t, values, column_order, null_precedence, mr); + return detail::upper_bound( + t, values, column_order, null_precedence, rmm::cuda_stream_default, mr); } bool contains(column_view const& col, scalar const& value) { CUDF_FUNC_RANGE(); - return detail::contains(col, value); + return detail::contains(col, value, rmm::cuda_stream_default); } std::unique_ptr contains(column_view const& haystack, @@ -385,7 +391,7 @@ std::unique_ptr contains(column_view const& haystack, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(haystack, needles, mr); + return detail::contains(haystack, needles, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 57475e0f59e..a9bb84bd1c3 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -87,8 +87,8 @@ std::pair, std::unique_ptr> encode( input_table, std::vector(input_table.num_columns(), order::ASCENDING), std::vector(input_table.num_columns(), null_order::AFTER), - mr, - stream); + stream, + mr); return std::make_pair(std::move(keys_table), std::move(indices_column)); } From 9087cf537f43f3dc714a4679d53858385250f6f3 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 15:21:14 
+1100 Subject: [PATCH 36/51] Convert sequence to cuda_stream_view --- cpp/include/cudf/detail/sequence.hpp | 10 ++++--- cpp/src/filling/sequence.cu | 43 ++++++++++++++-------------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/cpp/include/cudf/detail/sequence.hpp b/cpp/include/cudf/detail/sequence.hpp index c71e97fe79b..c3bbb734476 100644 --- a/cpp/include/cudf/detail/sequence.hpp +++ b/cpp/include/cudf/detail/sequence.hpp @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -33,8 +35,8 @@ std::unique_ptr sequence( size_type size, scalar const& init, scalar const& step, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::sequence(size_type size, scalar const& init, @@ -46,8 +48,8 @@ std::unique_ptr sequence( std::unique_ptr sequence( size_type size, scalar const& init, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu index 50b00c74882..c09eebd8f5a 100644 --- a/cpp/src/filling/sequence.cu +++ b/cpp/src/filling/sequence.cu @@ -14,15 +14,16 @@ * limitations under the License. */ +#include +#include +#include #include #include #include #include #include -#include -#include -#include +#include namespace cudf { namespace detail { @@ -59,8 +60,8 @@ struct sequence_functor { std::unique_ptr operator()(size_type size, scalar const& init, scalar const& step, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto result = make_fixed_width_column(init.type(), size, mask_state::UNALLOCATED, stream, mr); auto result_device_view = mutable_column_device_view::create(*result, stream); @@ -73,7 +74,7 @@ struct sequence_functor { // not using thrust::sequence because it requires init and step to be passed as // constants, not iterators. to do that we would have to retrieve the scalar values off the gpu, // which is undesirable from a performance perspective. 
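The comment above is the key design point: init and step are read through scalar device views inside the functor, so the host never has to copy the scalar values back before launching the fill. A rough sketch of that pattern (helper names are assumed, this is not the exact functor used in this file):

#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/types.hpp>

template <typename T>
struct tabulator_sketch {
  cudf::numeric_scalar_device_view<T> init;
  cudf::numeric_scalar_device_view<T> step;
  // Invoked once per output row on the device; no host-side scalar reads are needed.
  __device__ T operator()(cudf::size_type idx) const
  {
    return init.value() + static_cast<T>(idx) * step.value();
  }
};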
- thrust::tabulate(rmm::exec_policy(stream)->on(stream), + thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()), result_device_view->begin(), result_device_view->end(), tabulator{n_init, n_step}); @@ -87,8 +88,8 @@ struct sequence_functor { std::unique_ptr operator()(size_type size, scalar const& init, scalar const& step, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unsupported sequence scalar type"); } @@ -98,8 +99,8 @@ struct sequence_functor { typename std::enable_if_t() and not cudf::is_boolean()>* = nullptr> std::unique_ptr operator()(size_type size, scalar const& init, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto result = make_fixed_width_column(init.type(), size, mask_state::UNALLOCATED, stream, mr); auto result_device_view = mutable_column_device_view::create(*result, stream); @@ -110,7 +111,7 @@ struct sequence_functor { // not using thrust::sequence because it requires init and step to be passed as // constants, not iterators. to do that we would have to retrieve the scalar values off the gpu, // which is undesirable from a performance perspective. - thrust::tabulate(rmm::exec_policy(stream)->on(stream), + thrust::tabulate(rmm::exec_policy(stream)->on(stream.value()), result_device_view->begin(), result_device_view->end(), const_tabulator{n_init}); @@ -123,8 +124,8 @@ struct sequence_functor { typename std::enable_if_t() or cudf::is_boolean()>* = nullptr> std::unique_ptr operator()(size_type size, scalar const& init, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unsupported sequence scalar type"); } @@ -135,26 +136,26 @@ struct sequence_functor { std::unique_ptr sequence(size_type size, scalar const& init, scalar const& step, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(init.type() == step.type(), "init and step must be of the same type."); CUDF_EXPECTS(size >= 0, "size must be >= 0"); CUDF_EXPECTS(is_numeric(init.type()), "Input scalar types must be numeric"); - return type_dispatcher(init.type(), sequence_functor{}, size, init, step, mr, stream); + return type_dispatcher(init.type(), sequence_functor{}, size, init, step, stream, mr); } std::unique_ptr sequence( size_type size, scalar const& init, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(size >= 0, "size must be >= 0"); CUDF_EXPECTS(is_numeric(init.type()), "init scalar type must be numeric"); - return type_dispatcher(init.type(), sequence_functor{}, size, init, mr, stream); + return type_dispatcher(init.type(), sequence_functor{}, size, init, stream, mr); } } // namespace detail @@ -164,14 +165,14 @@ std::unique_ptr sequence(size_type size, scalar const& step, rmm::mr::device_memory_resource* mr) { - return detail::sequence(size, init, step, mr, 0); + return detail::sequence(size, init, step, rmm::cuda_stream_default, mr); } std::unique_ptr sequence(size_type size, scalar const& init, rmm::mr::device_memory_resource* mr) { - return detail::sequence(size, init, mr, 0); + return detail::sequence(size, init, rmm::cuda_stream_default, mr); } 
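The public cudf::sequence overloads above now always forward rmm::cuda_stream_default, while internal callers may pass any rmm::cuda_stream_view, for example one wrapping an existing raw stream. A hedged sketch of such a detail-layer call (function and variable names are illustrative, not taken from the patch):

#include <cudf/detail/sequence.hpp>
#include <cudf/scalar/scalar.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <cstdint>
#include <memory>

std::unique_ptr<cudf::column> make_even_sequence(cudf::size_type n, cudaStream_t raw_stream)
{
  rmm::cuda_stream_view stream{raw_stream};  // non-owning view over the caller's stream
  cudf::numeric_scalar<int32_t> init(0);
  cudf::numeric_scalar<int32_t> step(2);
  return cudf::detail::sequence(n, init, step, stream, rmm::mr::get_current_device_resource());
}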
} // namespace cudf From 576dae1afcf1cb747c1fe8031fd66e794e875688 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 15:51:38 +1100 Subject: [PATCH 37/51] Convert sorting and stream compaction to cuda_stream_view --- cpp/include/cudf/detail/sorting.hpp | 14 ++++--- cpp/include/cudf/detail/stream_compaction.hpp | 26 ++++++------ cpp/include/cudf/strings/sorting.hpp | 6 ++- cpp/src/copying/scatter.cu | 6 +-- cpp/src/dictionary/add_keys.cu | 4 +- cpp/src/dictionary/detail/concatenate.cu | 4 +- cpp/src/dictionary/set_keys.cu | 4 +- cpp/src/groupby/sort/sort_helper.cu | 12 +++--- cpp/src/reductions/reductions.cpp | 4 +- cpp/src/sort/rank.cu | 4 +- cpp/src/sort/sort.cu | 21 ++++++---- cpp/src/sort/sort_impl.cuh | 16 ++++---- cpp/src/sort/stable_sort.cu | 13 +++--- .../stream_compaction/apply_boolean_mask.cu | 8 ++-- cpp/src/stream_compaction/drop_duplicates.cu | 41 ++++++++++--------- cpp/src/stream_compaction/drop_nans.cu | 10 +++-- cpp/src/stream_compaction/drop_nulls.cu | 9 ++-- cpp/src/strings/sorting/sorting.cu | 8 ++-- cpp/src/transform/encode.cu | 2 +- 19 files changed, 117 insertions(+), 95 deletions(-) diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp index 635678fa813..0ac20ed3c94 100644 --- a/cpp/include/cudf/detail/sorting.hpp +++ b/cpp/include/cudf/detail/sorting.hpp @@ -18,6 +18,8 @@ #include +#include + #include #include @@ -33,8 +35,8 @@ std::unique_ptr sorted_order( table_view input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::stable_sorted_order @@ -45,8 +47,8 @@ std::unique_ptr stable_sorted_order( table_view input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::sort_by_key @@ -58,8 +60,8 @@ std::unique_ptr
sort_by_key( table_view const& keys, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 46068b64a93..5bc12fb0713 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -32,8 +34,8 @@ std::unique_ptr
drop_nulls( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::drop_nans(table_view const&, std::vector const&, @@ -45,8 +47,8 @@ std::unique_ptr
drop_nans( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::apply_boolean_mask @@ -56,8 +58,8 @@ std::unique_ptr
drop_nans( std::unique_ptr
apply_boolean_mask( table_view const& input, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::drop_duplicates @@ -69,8 +71,8 @@ std::unique_ptr
drop_duplicates( std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) @@ -80,7 +82,7 @@ std::unique_ptr
drop_duplicates( cudf::size_type distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @copydoc cudf::distinct_count(table_view const&, null_equality) @@ -88,8 +90,8 @@ cudf::size_type distinct_count(column_view const& input, * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ cudf::size_type distinct_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL, - cudaStream_t stream = 0); + null_equality nulls_equal = null_equality::EQUAL, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/strings/sorting.hpp b/cpp/include/cudf/strings/sorting.hpp index 8ce5d43ca12..84ce2e4ec2b 100644 --- a/cpp/include/cudf/strings/sorting.hpp +++ b/cpp/include/cudf/strings/sorting.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -48,7 +50,7 @@ std::unique_ptr sort( sort_type stype, cudf::order order = cudf::order::ASCENDING, cudf::null_order null_order = cudf::null_order::BEFORE, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 887972f2dd6..036962ab744 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -336,10 +336,8 @@ std::unique_ptr boolean_mask_scatter(column_view const& input, 0); // The scatter map is actually a table with only one column, which is scatter map. 
- auto scatter_map = detail::apply_boolean_mask(table_view{{indices->view()}}, - boolean_mask, - rmm::mr::get_current_device_resource(), - stream.value()); + auto scatter_map = detail::apply_boolean_mask( + table_view{{indices->view()}}, boolean_mask, stream, rmm::mr::get_current_device_resource()); auto output_table = detail::scatter(table_view{{input}}, scatter_map->get_column(0).view(), table_view{{target}}, diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 64ce8d1e07e..b72b4f38a56 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -65,8 +65,8 @@ std::unique_ptr add_keys( std::vector{0}, // only one key column duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, - mr, - stream) + stream, + mr) ->release(); std::unique_ptr keys_column(std::move(table_keys.front())); // create a map for the indices diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index b83de6575e8..223e2d7c331 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -213,8 +213,8 @@ std::unique_ptr concatenate(std::vector const& columns, std::vector{0}, duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, - mr, - stream.value()) + stream, + mr) ->release(); std::unique_ptr keys_column(std::move(table_keys.front())); diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index ae4a817f182..c934e495de3 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -113,8 +113,8 @@ std::unique_ptr set_keys( std::vector{0}, duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, - mr, - stream) + stream, + mr) ->release(); std::unique_ptr keys_column(std::move(table_keys.front())); diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 93d785b78f0..4b4c6a96688 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -132,8 +132,8 @@ column_view sort_groupby_helper::key_sort_order(rmm::cuda_stream_view stream) _keys, {}, std::vector(_keys.num_columns(), null_order::AFTER), - rmm::mr::get_current_device_resource(), - stream.value()); + stream, + rmm::mr::get_current_device_resource()); } else { // Pandas style // Temporarily prepend the keys table with a column that indicates the // presence of a null value within a row. This allows moving all rows that @@ -145,8 +145,8 @@ column_view sort_groupby_helper::key_sort_order(rmm::cuda_stream_view stream) augmented_keys, {}, std::vector(_keys.num_columns() + 1, null_order::AFTER), - rmm::mr::get_current_device_resource(), - stream.value()); + stream, + rmm::mr::get_current_device_resource()); // All rows with one or more null values are at the end of the resulting sorted order. 
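Call sites like the groupby helper above now hand their stream straight to detail::sorted_order and detail::stable_sorted_order. A small sketch of that call shape (illustrative only; the keys table is a placeholder and the vectors assume a single key column):

#include <cudf/detail/sorting.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

std::unique_ptr<cudf::column> ascending_permutation(cudf::table_view const& keys,
                                                    rmm::cuda_stream_view stream)
{
  // Stream first, memory resource last, matching the converted detail signature.
  return cudf::detail::sorted_order(keys,
                                    {cudf::order::ASCENDING},
                                    {cudf::null_order::AFTER},
                                    stream,
                                    rmm::mr::get_current_device_resource());
}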
} @@ -269,8 +269,8 @@ sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values( cudf::detail::stable_sorted_order(table_view({unsorted_keys_labels(), values}), {}, std::vector(2, null_order::AFTER), - mr, - stream.value()); + stream, + mr); // Zero-copy slice this sort order so that its new size is num_keys() column_view gather_map = cudf::detail::slice(values_sort_order->view(), 0, num_keys(stream)); diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 7afebaab154..8677065a74a 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -67,7 +67,7 @@ struct reduce_dispatch_functor { } break; case aggregation::MEDIAN: { auto sorted_indices = - detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, mr, stream.value()); + detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, mr); auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; auto col_ptr = detail::quantile( col, {0.5}, interpolation::LINEAR, valid_sorted_indices, true, stream, mr); @@ -78,7 +78,7 @@ struct reduce_dispatch_functor { CUDF_EXPECTS(quantile_agg->_quantiles.size() == 1, "Reduction quantile accepts only one quantile value"); auto sorted_indices = - detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, mr, stream.value()); + detail::sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, mr); auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; auto col_ptr = detail::quantile(col, quantile_agg->_quantiles, diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index a3a16130dfb..50f8155313f 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -259,8 +259,8 @@ std::unique_ptr rank(column_view const &input, std::unique_ptr sorted_order = (method == rank_method::FIRST) ? detail::stable_sorted_order( - table_view{{input}}, {column_order}, {null_precedence}, mr, stream) - : detail::sorted_order(table_view{{input}}, {column_order}, {null_precedence}, mr, stream); + table_view{{input}}, {column_order}, {null_precedence}, stream, mr) + : detail::sorted_order(table_view{{input}}, {column_order}, {null_precedence}, stream, mr); column_view sorted_order_view = sorted_order->view(); // dense: All equal values have same rank and rank always increases by 1 between groups diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index d163c4e5be8..18d6839e2a2 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "rmm/cuda_stream_view.hpp" #include "sort_impl.cuh" #include @@ -27,23 +28,23 @@ namespace detail { std::unique_ptr sorted_order(table_view input, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return sorted_order(input, column_order, null_precedence, mr, stream); + return sorted_order(input, column_order, null_precedence, stream, mr); } std::unique_ptr
sort_by_key(table_view const& values, table_view const& keys, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(values.num_rows() == keys.num_rows(), "Mismatch in number of rows for values and keys"); - auto sorted_order = detail::sorted_order(keys, column_order, null_precedence, mr, stream); + auto sorted_order = detail::sorted_order(keys, column_order, null_precedence, stream, mr); return detail::gather(values, sorted_order->view(), @@ -61,7 +62,7 @@ std::unique_ptr sorted_order(table_view input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sorted_order(input, column_order, null_precedence, mr); + return detail::sorted_order(input, column_order, null_precedence, rmm::cuda_stream_default, mr); } std::unique_ptr
sort(table_view input, @@ -70,7 +71,8 @@ std::unique_ptr
sort(table_view input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort_by_key(input, input, column_order, null_precedence, mr); + return detail::sort_by_key( + input, input, column_order, null_precedence, rmm::cuda_stream_default, mr); } std::unique_ptr
sort_by_key(table_view const& values, @@ -80,7 +82,8 @@ std::unique_ptr
sort_by_key(table_view const& values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort_by_key(values, keys, column_order, null_precedence, mr); + return detail::sort_by_key( + values, keys, column_order, null_precedence, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/sort/sort_impl.cuh b/cpp/src/sort/sort_impl.cuh index d043f2e8947..97de42d805d 100644 --- a/cpp/src/sort/sort_impl.cuh +++ b/cpp/src/sort/sort_impl.cuh @@ -23,6 +23,8 @@ #include #include +#include + #include namespace cudf { @@ -32,8 +34,8 @@ template std::unique_ptr sorted_order(table_view input, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.num_rows() == 0 or input.num_columns() == 0) { return cudf::make_numeric_column(data_type(type_to_id()), 0); @@ -56,7 +58,7 @@ std::unique_ptr sorted_order(table_view input, auto device_table = table_device_view::create(input, stream); - thrust::sequence(rmm::exec_policy(stream)->on(stream), + thrust::sequence(rmm::exec_policy(stream)->on(stream.value()), mutable_indices_view.begin(), mutable_indices_view.end(), 0); @@ -68,12 +70,12 @@ std::unique_ptr sorted_order(table_view input, auto comparator = row_lexicographic_comparator( *device_table, *device_table, d_column_order.data().get(), d_null_precedence.data().get()); if (stable) { - thrust::stable_sort(rmm::exec_policy(stream)->on(stream), + thrust::stable_sort(rmm::exec_policy(stream)->on(stream.value()), mutable_indices_view.begin(), mutable_indices_view.end(), comparator); } else { - thrust::sort(rmm::exec_policy(stream)->on(stream), + thrust::sort(rmm::exec_policy(stream)->on(stream.value()), mutable_indices_view.begin(), mutable_indices_view.end(), comparator); @@ -82,12 +84,12 @@ std::unique_ptr sorted_order(table_view input, auto comparator = row_lexicographic_comparator( *device_table, *device_table, d_column_order.data().get()); if (stable) { - thrust::stable_sort(rmm::exec_policy(stream)->on(stream), + thrust::stable_sort(rmm::exec_policy(stream)->on(stream.value()), mutable_indices_view.begin(), mutable_indices_view.end(), comparator); } else { - thrust::sort(rmm::exec_policy(stream)->on(stream), + thrust::sort(rmm::exec_policy(stream)->on(stream.value()), mutable_indices_view.begin(), mutable_indices_view.end(), comparator); diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu index 982d5df6a9a..860e88ae76e 100644 --- a/cpp/src/sort/stable_sort.cu +++ b/cpp/src/sort/stable_sort.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-20, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,15 +21,17 @@ #include #include +#include + namespace cudf { namespace detail { std::unique_ptr stable_sorted_order(table_view input, std::vector const& column_order, std::vector const& null_precedence, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return sorted_order(input, column_order, null_precedence, mr, stream); + return sorted_order(input, column_order, null_precedence, stream, mr); } } // namespace detail @@ -39,7 +41,8 @@ std::unique_ptr stable_sorted_order(table_view input, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) { - return detail::stable_sorted_order(input, column_order, null_precedence, mr); + return detail::stable_sorted_order( + input, column_order, null_precedence, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/apply_boolean_mask.cu b/cpp/src/stream_compaction/apply_boolean_mask.cu index ccb31898e95..3eb10f9f717 100644 --- a/cpp/src/stream_compaction/apply_boolean_mask.cu +++ b/cpp/src/stream_compaction/apply_boolean_mask.cu @@ -25,6 +25,8 @@ #include #include +#include + #include namespace { @@ -61,8 +63,8 @@ namespace detail { */ std::unique_ptr
apply_boolean_mask(table_view const& input, column_view const& boolean_mask, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (boolean_mask.is_empty()) { return empty_like(input); } @@ -90,6 +92,6 @@ std::unique_ptr
apply_boolean_mask(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::apply_boolean_mask(input, boolean_mask, mr); + return detail::apply_boolean_mask(input, boolean_mask, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 970ce7eb198..0208272a1d4 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -29,9 +29,12 @@ #include #include +#include + #include #include #include + #include #include @@ -105,14 +108,14 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, cudf::mutable_column_view& unique_indices, duplicate_keep_option keep, null_equality nulls_equal, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream) { // sort only indices auto sorted_indices = sorted_order(keys, std::vector{}, std::vector{}, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); // extract unique indices auto device_input_table = cudf::table_device_view::create(keys, stream); @@ -120,7 +123,7 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, if (cudf::has_nulls(keys)) { auto comp = row_equality_comparator( *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); - auto result_end = unique_copy(rmm::exec_policy(stream)->on(stream), + auto result_end = unique_copy(rmm::exec_policy(stream)->on(stream.value()), sorted_indices->view().begin(), sorted_indices->view().end(), unique_indices.begin(), @@ -134,7 +137,7 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, } else { auto comp = row_equality_comparator( *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); - auto result_end = unique_copy(rmm::exec_policy(stream)->on(stream), + auto result_end = unique_copy(rmm::exec_policy(stream)->on(stream.value()), sorted_indices->view().begin(), sorted_indices->view().end(), unique_indices.begin(), @@ -150,14 +153,14 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, cudf::size_type distinct_count(table_view const& keys, null_equality nulls_equal, - cudaStream_t stream) + rmm::cuda_stream_view stream) { // sort only indices auto sorted_indices = sorted_order(keys, std::vector{}, std::vector{}, - rmm::mr::get_current_device_resource(), - stream); + stream, + rmm::mr::get_current_device_resource()); // count unique elements auto sorted_row_index = sorted_indices->view().data(); @@ -167,7 +170,7 @@ cudf::size_type distinct_count(table_view const& keys, row_equality_comparator comp( *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); return thrust::count_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::counting_iterator(0), thrust::counting_iterator(keys.num_rows()), [sorted_row_index, comp] __device__(cudf::size_type i) { @@ -177,7 +180,7 @@ cudf::size_type distinct_count(table_view const& keys, row_equality_comparator comp( *device_input_table, *device_input_table, nulls_equal == null_equality::EQUAL); return thrust::count_if( - rmm::exec_policy(stream)->on(stream), + rmm::exec_policy(stream)->on(stream.value()), thrust::counting_iterator(0), thrust::counting_iterator(keys.num_rows()), [sorted_row_index, comp] __device__(cudf::size_type i) { @@ -190,8 +193,8 @@ std::unique_ptr
drop_duplicates(table_view const& input, std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (0 == input.num_rows() || 0 == input.num_columns() || 0 == keys.size()) { return empty_like(input); @@ -263,11 +266,11 @@ struct has_nans { * @returns bool true if `input` has `NAN` else false */ template ::value>* = nullptr> - bool operator()(column_view const& input, cudaStream_t stream) + bool operator()(column_view const& input, rmm::cuda_stream_view stream) { auto input_device_view = cudf::column_device_view::create(input, stream); auto device_view = *input_device_view; - auto count = thrust::count_if(rmm::exec_policy(stream)->on(stream), + auto count = thrust::count_if(rmm::exec_policy(stream)->on(stream.value()), thrust::counting_iterator(0), thrust::counting_iterator(input.size()), check_for_nan(device_view)); @@ -287,7 +290,7 @@ struct has_nans { * @returns bool Always false as non-floating point columns can't have `NAN` */ template ::value>* = nullptr> - bool operator()(column_view const& input, cudaStream_t stream) + bool operator()(column_view const& input, rmm::cuda_stream_view stream) { return false; } @@ -296,7 +299,7 @@ struct has_nans { cudf::size_type distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling, - cudaStream_t stream) + rmm::cuda_stream_view stream) { if (0 == input.size() || input.null_count() == input.size()) { return 0; } @@ -332,7 +335,7 @@ std::unique_ptr
drop_duplicates(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::drop_duplicates(input, keys, keep, nulls_equal, mr); + return detail::drop_duplicates(input, keys, keep, nulls_equal, rmm::cuda_stream_default, mr); } cudf::size_type distinct_count(column_view const& input, @@ -340,13 +343,13 @@ cudf::size_type distinct_count(column_view const& input, nan_policy nan_handling) { CUDF_FUNC_RANGE(); - return detail::distinct_count(input, null_handling, nan_handling); + return detail::distinct_count(input, null_handling, nan_handling, rmm::cuda_stream_default); } cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal) { CUDF_FUNC_RANGE(); - return detail::distinct_count(input, nulls_equal); + return detail::distinct_count(input, nulls_equal, rmm::cuda_stream_default); } } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_nans.cu b/cpp/src/stream_compaction/drop_nans.cu index ddd5d0c9934..80d92669344 100644 --- a/cpp/src/stream_compaction/drop_nans.cu +++ b/cpp/src/stream_compaction/drop_nans.cu @@ -23,6 +23,8 @@ #include #include +#include + namespace { struct dispatch_is_not_nan { @@ -82,8 +84,8 @@ namespace detail { std::unique_ptr
drop_nans(table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto keys_view = input.select(keys); if (keys_view.num_columns() == 0 || keys_view.num_rows() == 0) { @@ -113,7 +115,7 @@ std::unique_ptr
<table> drop_nans(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::drop_nans(input, keys, keep_threshold, mr); + return cudf::detail::drop_nans(input, keys, keep_threshold, rmm::cuda_stream_default, mr); } /* * Filters a table to remove nan null elements. @@ -123,7 +125,7 @@ std::unique_ptr<table>
drop_nans(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::drop_nans(input, keys, keys.size(), mr); + return cudf::detail::drop_nans(input, keys, keys.size(), rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/drop_nulls.cu b/cpp/src/stream_compaction/drop_nulls.cu index 49708b635d8..71aa8f6c63c 100644 --- a/cpp/src/stream_compaction/drop_nulls.cu +++ b/cpp/src/stream_compaction/drop_nulls.cu @@ -21,6 +21,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace { // Returns true if the mask is true for index i in at least keep_threshold @@ -61,8 +62,8 @@ namespace detail { std::unique_ptr
drop_nulls(table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto keys_view = input.select(keys); if (keys_view.num_columns() == 0 || keys_view.num_rows() == 0 || not cudf::has_nulls(keys_view)) { @@ -86,7 +87,7 @@ std::unique_ptr
<table> drop_nulls(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::drop_nulls(input, keys, keep_threshold, mr); + return cudf::detail::drop_nulls(input, keys, keep_threshold, rmm::cuda_stream_default, mr); } /* * Filters a table to remove null elements. @@ -96,7 +97,7 @@ std::unique_ptr<table>
drop_nulls(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return cudf::detail::drop_nulls(input, keys, keys.size(), mr); + return cudf::detail::drop_nulls(input, keys, keys.size(), rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/strings/sorting/sorting.cu b/cpp/src/strings/sorting/sorting.cu index 0a5a2238d9b..3d78024064e 100644 --- a/cpp/src/strings/sorting/sorting.cu +++ b/cpp/src/strings/sorting/sorting.cu @@ -21,6 +21,8 @@ #include #include +#include + #include #include @@ -32,7 +34,7 @@ std::unique_ptr sort(strings_column_view strings, sort_type stype, cudf::order order, cudf::null_order null_order, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto execpol = rmm::exec_policy(stream); @@ -42,8 +44,8 @@ std::unique_ptr sort(strings_column_view strings, // sort the indices of the strings size_type num_strings = strings.size(); rmm::device_vector indices(num_strings); - thrust::sequence(execpol->on(stream), indices.begin(), indices.end()); - thrust::sort(execpol->on(stream), + thrust::sequence(execpol->on(stream.value()), indices.begin(), indices.end()); + thrust::sort(execpol->on(stream.value()), indices.begin(), indices.end(), [d_column, stype, order, null_order] __device__(size_type lhs, size_type rhs) { diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index a9bb84bd1c3..895f6309886 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -43,7 +43,7 @@ std::pair, std::unique_ptr> encode( // - resulting column elements are sorted ascending // - nulls are sorted to the beginning auto keys_table = cudf::detail::drop_duplicates( - input_table, drop_keys, duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, mr, stream); + input_table, drop_keys, duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, stream, mr); if (cudf::has_nulls(keys_table->view())) { // Rows with nulls appear at the top of `keys_table`, but we want them to appear at From 6ab4384fd7e12d755a085df66ddbcea0001fb5fd Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 17:53:00 +1100 Subject: [PATCH 38/51] convert transform to cuda_stream_view --- cpp/include/cudf/detail/transform.hpp | 22 ++++++++++++---------- cpp/src/dictionary/encode.cu | 2 +- cpp/src/interop/to_arrow.cpp | 2 +- cpp/src/jit/launcher.cpp | 7 +++++-- cpp/src/jit/launcher.h | 14 +++++++++----- cpp/src/transform/bools_to_mask.cu | 8 +++++--- cpp/src/transform/encode.cu | 7 ++++--- cpp/src/transform/mask_to_bools.cu | 8 +++++--- cpp/src/transform/nans_to_nulls.cu | 18 ++++++++++++------ cpp/src/transform/transform.cpp | 9 +++++---- 10 files changed, 59 insertions(+), 38 deletions(-) diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 9cffbd3be70..0309542d01f 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
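The patches in this series all apply the same conversion pattern: a detail-namespace function takes an rmm::cuda_stream_view placed ahead of the rmm::mr::device_memory_resource* and defaulted to rmm::cuda_stream_default, calls that reach raw CUDA or Thrust entry points pass stream.value(), and the public (non-detail) API keeps its existing signature and forwards the default stream to the detail overload. The sketch below illustrates that pattern in isolation; make_zeroed_buffer is a hypothetical helper invented for illustration, not code from these patches.

// Illustrative sketch of the cudaStream_t -> rmm::cuda_stream_view pattern.
// make_zeroed_buffer is hypothetical; only the parameter conventions mirror the patches.
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <cuda_runtime.h>

#include <cstddef>

namespace detail {
// Stream first (strongly typed, non-owning), memory resource last; both keep their defaults.
rmm::device_buffer make_zeroed_buffer(
  std::size_t size,
  rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  rmm::device_buffer out(size, stream, mr);              // RMM APIs accept the view directly
  cudaMemsetAsync(out.data(), 0, size, stream.value());  // raw CUDA APIs take the wrapped cudaStream_t
  return out;
}
}  // namespace detail

// The public API keeps its signature and forwards the default stream to the detail overload.
rmm::device_buffer make_zeroed_buffer(std::size_t size, rmm::mr::device_memory_resource* mr)
{
  return detail::make_zeroed_buffer(size, rmm::cuda_stream_default, mr);
}

Placing the stream parameter ahead of the memory resource keeps both defaults usable at call sites, and cuda_stream_view is a cheap, non-owning value type, so it is passed by value rather than by reference.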
@@ -18,6 +18,8 @@ #include +#include + namespace cudf { namespace detail { /** @@ -30,8 +32,8 @@ std::unique_ptr transform( std::string const& unary_udf, data_type output_type, bool is_ptx, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::nans_to_nulls @@ -40,8 +42,8 @@ std::unique_ptr transform( **/ std::pair, size_type> nans_to_nulls( column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::bools_to_mask @@ -50,8 +52,8 @@ std::pair, size_type> nans_to_nulls( **/ std::pair, cudf::size_type> bools_to_mask( column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::encode @@ -60,8 +62,8 @@ std::pair, cudf::size_type> bools_to_mask( **/ std::pair, std::unique_ptr> encode( cudf::table_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::mask_to_bools @@ -72,7 +74,7 @@ std::unique_ptr mask_to_bools( bitmask_type const* null_mask, size_type begin_bit, size_type end_bit, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index 613974efde7..9c20d5006bf 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -45,7 +45,7 @@ std::unique_ptr encode(column_view const& input_column, CUDF_EXPECTS(input_column.type().id() != type_id::DICTIONARY32, "cannot encode a dictionary from a dictionary"); - auto codified = cudf::detail::encode(cudf::table_view({input_column}), mr, stream); + auto codified = cudf::detail::encode(cudf::table_view({input_column}), stream, mr); auto keys_table = std::move(codified.first); auto indices_column = std::move(codified.second); auto keys_column = std::move(keys_table->release().front()); diff --git a/cpp/src/interop/to_arrow.cpp b/cpp/src/interop/to_arrow.cpp index 5f270597403..ec58da6bf0b 100644 --- a/cpp/src/interop/to_arrow.cpp +++ b/cpp/src/interop/to_arrow.cpp @@ -137,7 +137,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - auto bitmask = bools_to_mask(input, rmm::mr::get_current_device_resource(), stream.value()); + auto bitmask = bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); auto result = arrow::AllocateBuffer(static_cast(bitmask.first->size()), ar_mr); CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); diff --git a/cpp/src/jit/launcher.cpp b/cpp/src/jit/launcher.cpp index 704379ee82e..65bb55f9102 100644 --- a/cpp/src/jit/launcher.cpp +++ b/cpp/src/jit/launcher.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 
NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -22,14 +22,17 @@ #include #include +#include + namespace cudf { namespace jit { + launcher::launcher(const std::string& hash, const std::string& cuda_source, const std::vector& header_names, const std::vector& compiler_flags, jitify::experimental::file_callback_type file_callback, - cudaStream_t stream) + rmm::cuda_stream_view stream) : cache_instance{cudf::jit::cudfJitCache::Instance()}, stream(stream) { program = cache_instance.getProgram( diff --git a/cpp/src/jit/launcher.h b/cpp/src/jit/launcher.h index 3745854e277..60720816bc1 100644 --- a/cpp/src/jit/launcher.h +++ b/cpp/src/jit/launcher.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -20,9 +20,13 @@ #pragma once #include + +#include + +#include + #include #include -#include #include #include #include @@ -58,7 +62,7 @@ class launcher { const std::vector& header_names, const std::vector& compiler_flags, jitify::experimental::file_callback_type file_callback, - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default); launcher(launcher&&); launcher(const launcher&) = delete; launcher& operator=(launcher&&) = delete; @@ -91,14 +95,14 @@ class launcher { template void launch(Args... args) { - get_kernel().configure_1d_max_occupancy(0, 0, 0, stream).launch(args...); + get_kernel().configure_1d_max_occupancy(0, 0, 0, stream.value()).launch(args...); } private: cudf::jit::cudfJitCache& cache_instance; cudf::jit::named_prog program; cudf::jit::named_prog kernel_inst; - cudaStream_t stream; + rmm::cuda_stream_view stream; jitify::experimental::KernelInstantiation& get_kernel() { return *std::get<1>(kernel_inst); } }; diff --git a/cpp/src/transform/bools_to_mask.cu b/cpp/src/transform/bools_to_mask.cu index f7e029f5ed7..2cf4771890b 100644 --- a/cpp/src/transform/bools_to_mask.cu +++ b/cpp/src/transform/bools_to_mask.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,10 +25,12 @@ #include #include +#include + namespace cudf { namespace detail { std::pair, cudf::size_type> bools_to_mask( - column_view const& input, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type().id() == type_id::BOOL8, "Input is not of type bool"); @@ -58,7 +60,7 @@ std::pair, cudf::size_type> bools_to_mask( column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::bools_to_mask(input, mr); + return detail::bools_to_mask(input, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 895f6309886..1ecf8a7814a 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -34,7 +35,7 @@ namespace cudf { namespace detail { std::pair, std::unique_ptr> encode( - table_view const& input_table, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + table_view const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { std::vector drop_keys(input_table.num_columns()); std::iota(drop_keys.begin(), drop_keys.end(), 0); @@ -59,7 +60,7 @@ std::pair, std::unique_ptr> encode( rmm::device_vector gather_map(num_rows); auto execpol = rmm::exec_policy(stream); - thrust::transform(execpol->on(stream), + thrust::transform(execpol->on(stream.value()), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_rows), gather_map.begin(), @@ -98,7 +99,7 @@ std::pair, std::unique_ptr> encode( std::pair, std::unique_ptr> encode( cudf::table_view const& input, rmm::mr::device_memory_resource* mr) { - return detail::encode(input, mr, 0); + return detail::encode(input, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/transform/mask_to_bools.cu b/cpp/src/transform/mask_to_bools.cu index fb6e0a7148c..1202c754287 100644 --- a/cpp/src/transform/mask_to_bools.cu +++ b/cpp/src/transform/mask_to_bools.cu @@ -23,12 +23,14 @@ #include #include +#include + namespace cudf { namespace detail { std::unique_ptr mask_to_bools(bitmask_type const* bitmask, size_type begin_bit, size_type end_bit, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const length = end_bit - begin_bit; @@ -41,7 +43,7 @@ std::unique_ptr mask_to_bools(bitmask_type const* bitmask, if (length > 0) { auto mutable_view = out_col->mutable_view(); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(begin_bit), thrust::make_counting_iterator(end_bit), mutable_view.begin(), @@ -57,6 +59,6 @@ std::unique_ptr mask_to_bools(bitmask_type const* bitmask, size_type end_bit, rmm::mr::device_memory_resource* mr) { - return detail::mask_to_bools(bitmask, begin_bit, end_bit, 0, mr); + return detail::mask_to_bools(bitmask, begin_bit, end_bit, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/transform/nans_to_nulls.cu b/cpp/src/transform/nans_to_nulls.cu index 977073ce48f..93a7521546d 100644 --- a/cpp/src/transform/nans_to_nulls.cu +++ b/cpp/src/transform/nans_to_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,13 +24,17 @@ #include #include +#include + namespace cudf { namespace detail { struct dispatch_nan_to_null { template std::enable_if_t::value, std::pair, cudf::size_type>> - operator()(column_view const& input, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + operator()(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto input_device_view_ptr = column_device_view::create(input, stream); auto input_device_view = *input_device_view_ptr; @@ -68,18 +72,20 @@ struct dispatch_nan_to_null { template std::enable_if_t::value, std::pair, cudf::size_type>> - operator()(column_view const& input, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + operator()(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Input column can't be a non-floating type"); } }; std::pair, cudf::size_type> nans_to_nulls( - column_view const& input, rmm::mr::device_memory_resource* mr, cudaStream_t stream) + column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (input.is_empty()) { return std::make_pair(std::make_unique(), 0); } - return cudf::type_dispatcher(input.type(), dispatch_nan_to_null{}, input, mr, stream); + return cudf::type_dispatcher(input.type(), dispatch_nan_to_null{}, input, stream, mr); } } // namespace detail @@ -88,7 +94,7 @@ std::pair, cudf::size_type> nans_to_nulls( column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::nans_to_nulls(input, mr); + return detail::nans_to_nulls(input, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 587b29201e9..2372382d178 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -26,6 +26,7 @@ #include #include #include "jit/code/code.h" +#include "rmm/cuda_stream_view.hpp" #include #include @@ -52,7 +53,7 @@ void unary_operation(mutable_column_view output, const std::string& udf, data_type output_type, bool is_ptx, - cudaStream_t stream) + rmm::cuda_stream_view stream) { std::string hash = "prog_transform" + std::to_string(std::hash{}(udf)); @@ -86,8 +87,8 @@ std::unique_ptr transform(column_view const& input, std::string const& unary_udf, data_type output_type, bool is_ptx, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_fixed_width(input.type()), "Unexpected non-fixed-width type."); @@ -113,7 +114,7 @@ std::unique_ptr transform(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::transform(input, unary_udf, output_type, is_ptx, mr); + return detail::transform(input, unary_udf, output_type, is_ptx, rmm::cuda_stream_default, mr); } } // namespace cudf From d08b2d061b8888907bda8be899204f0678c056dc Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 9 Nov 2020 17:55:01 +1100 Subject: [PATCH 39/51] Convert transpose to cuda_stream_view --- cpp/include/cudf/detail/transpose.hpp | 8 +++++--- cpp/src/transpose/transpose.cu | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/detail/transpose.hpp b/cpp/include/cudf/detail/transpose.hpp index 468409c1443..be2c567df35 100644 --- a/cpp/include/cudf/detail/transpose.hpp 
+++ b/cpp/include/cudf/detail/transpose.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -27,8 +29,8 @@ namespace detail { */ std::pair, table_view> transpose( table_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/transpose/transpose.cu b/cpp/src/transpose/transpose.cu index 3b4439f63f3..67a06e60dd3 100644 --- a/cpp/src/transpose/transpose.cu +++ b/cpp/src/transpose/transpose.cu @@ -24,11 +24,13 @@ #include #include +#include + namespace cudf { namespace detail { std::pair, table_view> transpose(table_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // If there are no rows in the input, return successfully if (input.num_columns() == 0 || input.num_rows() == 0) { @@ -57,7 +59,7 @@ std::pair, table_view> transpose(table_view const& input rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::transpose(input, mr); + return detail::transpose(input, rmm::cuda_stream_default, mr); } } // namespace cudf From 1715b80859b89873d797632b29d2efeb770a62c0 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 10 Nov 2020 08:49:13 +1100 Subject: [PATCH 40/51] Convert unary ops to cuda_stream_view --- cpp/include/cudf/detail/unary.hpp | 26 +-- cpp/src/dictionary/add_keys.cu | 2 +- cpp/src/dictionary/dictionary_factories.cu | 2 +- cpp/src/dictionary/encode.cu | 2 +- cpp/src/groupby/sort/groupby.cu | 2 +- cpp/src/interop/from_arrow.cpp | 3 +- cpp/src/interop/to_arrow.cpp | 4 +- cpp/src/unary/cast_ops.cu | 32 ++-- cpp/src/unary/math_ops.cu | 197 +++++++++++---------- cpp/src/unary/nan_ops.cu | 41 +++-- cpp/src/unary/null_ops.cu | 3 + cpp/src/unary/unary_ops.cuh | 12 +- 12 files changed, 172 insertions(+), 154 deletions(-) diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index cd8749cae0b..fb5416fe750 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace cudf { namespace detail { /** @@ -32,8 +34,8 @@ namespace detail { * @param begin Beginning of the sequence of elements * @param end End of the sequence of elements * @param p Predicate to be applied to each element in `[begin,end)` - * @param mr Device memory resource used to allocate the returned column's device memory * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A column of type `type_id::BOOL8,` with `true` representing predicate is satisfied. 
*/ @@ -44,15 +46,15 @@ std::unique_ptr true_if( InputIterator end, size_type size, Predicate p, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0) + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto output = make_numeric_column(data_type(type_id::BOOL8), size, mask_state::UNALLOCATED, stream, mr); auto output_mutable_view = output->mutable_view(); auto output_data = output_mutable_view.data(); - thrust::transform(rmm::exec_policy(stream)->on(stream), begin, end, output_data, p); + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), begin, end, output_data, p); return output; } @@ -65,8 +67,8 @@ std::unique_ptr true_if( std::unique_ptr unary_operation( cudf::column_view const& input, cudf::unary_op op, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::cast @@ -76,8 +78,8 @@ std::unique_ptr unary_operation( std::unique_ptr cast( column_view const& input, data_type type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::is_nan @@ -86,8 +88,8 @@ std::unique_ptr cast( */ std::unique_ptr is_nan( cudf::column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::is_not_nan @@ -96,8 +98,8 @@ std::unique_ptr is_nan( */ std::unique_ptr is_not_nan( cudf::column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index b72b4f38a56..6a9b294758d 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -111,7 +111,7 @@ std::unique_ptr add_keys( } // otherwise we need to convert the gather result column_view cast_view(gather_result.type(), indices_size, gather_result.head(), nullptr, 0); - return cudf::detail::cast(cast_view, indices_type, mr, stream); + return cudf::detail::cast(cast_view, indices_type, stream, mr); }(); // create new dictionary column with keys_column and indices_column diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index ec598b71f88..17a09e26f7b 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -139,7 +139,7 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys, } // If the new type does not match, then convert the data. 
cudf::column_view cast_view{cudf::data_type{indices_type}, indices_size, contents.data->data()}; - return cudf::detail::cast(cast_view, new_type, mr, stream); + return cudf::detail::cast(cast_view, new_type, stream, mr); }(); return make_dictionary_column(std::move(keys), diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index 9c20d5006bf..129c9345d4b 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -60,7 +60,7 @@ std::unique_ptr encode(column_view const& input_column, // the encode() returns INT32 for indices if (indices_column->type().id() != indices_type.id()) - indices_column = cudf::detail::cast(indices_column->view(), indices_type, mr, stream); + indices_column = cudf::detail::cast(indices_column->view(), indices_type, stream, mr); // create column with keys_column and indices_column return make_dictionary_column( diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index 7077e6f089c..cc77e9b8af8 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -307,7 +307,7 @@ void store_result_functor::operator()(aggregation const& agg) operator()(*var_agg); column_view var_result = cache.get_result(col_idx, *var_agg); - auto result = cudf::detail::unary_operation(var_result, unary_op::SQRT, mr, stream); + auto result = cudf::detail::unary_operation(var_result, unary_op::SQRT, stream, mr); cache.add_result(col_idx, agg, std::move(result)); }; diff --git a/cpp/src/interop/from_arrow.cpp b/cpp/src/interop/from_arrow.cpp index 690647d9306..4f208d8985c 100644 --- a/cpp/src/interop/from_arrow.cpp +++ b/cpp/src/interop/from_arrow.cpp @@ -256,8 +256,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( // If index type is not of type uint32_t, then cast it to uint32_t auto const dict_indices_type = data_type{type_id::UINT32}; if (indices_column->type().id() != dict_indices_type.id()) - indices_column = - cudf::detail::cast(indices_column->view(), dict_indices_type, mr, stream.value()); + indices_column = cudf::detail::cast(indices_column->view(), dict_indices_type, stream, mr); // Child columns shouldn't have masks and we need the mask in main column auto column_contents = indices_column->release(); diff --git a/cpp/src/interop/to_arrow.cpp b/cpp/src/interop/to_arrow.cpp index ec58da6bf0b..c36b2be77e8 100644 --- a/cpp/src/interop/to_arrow.cpp +++ b/cpp/src/interop/to_arrow.cpp @@ -276,8 +276,8 @@ std::shared_ptr dispatch_to_arrow::operator()( std::unique_ptr dict_indices = cast(cudf::dictionary_column_view(input).get_indices_annotated(), cudf::data_type{type_id::INT32}, - rmm::mr::get_current_device_resource(), - stream.value()); + stream, + rmm::mr::get_current_device_resource()); auto indices = dispatch_to_arrow{}.operator()( dict_indices->view(), dict_indices->type().id(), {}, ar_mr, stream); auto dict_keys = cudf::dictionary_column_view(input).keys(); diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index e8cc606865b..4f006527bbc 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -109,8 +109,8 @@ struct dispatch_unary_cast_to { !(cudf::is_timestamp() && is_numeric()) && !(cudf::is_timestamp() && is_numeric())>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto size = input.size(); auto output = @@ -122,7 +122,7 @@ struct dispatch_unary_cast_to { mutable_column_view output_mutable = *output; - 
thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), output_mutable.begin(), @@ -137,8 +137,8 @@ struct dispatch_unary_cast_to { (cudf::is_timestamp() && is_numeric()) || (cudf::is_timestamp() && is_numeric())>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (!cudf::is_fixed_width()) CUDF_FAIL("Column type must be numeric or chrono"); @@ -160,24 +160,24 @@ struct dispatch_unary_cast_from { typename T, typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return type_dispatcher(type, dispatch_unary_cast_to{input}, type, mr, stream); + return type_dispatcher(type, dispatch_unary_cast_to{input}, type, stream, mr); } template ()>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Fixed point unary ops not supported yet"); } template ()>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Column type must be numeric or chrono"); } @@ -185,12 +185,12 @@ struct dispatch_unary_cast_from { std::unique_ptr cast(column_view const& input, data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(is_fixed_width(type), "Unary cast type must be fixed-width."); - return type_dispatcher(input.type(), detail::dispatch_unary_cast_from{input}, type, mr, stream); + return type_dispatcher(input.type(), detail::dispatch_unary_cast_from{input}, type, stream, mr); } } // namespace detail @@ -200,7 +200,7 @@ std::unique_ptr cast(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::cast(input, type, mr); + return detail::cast(input, type, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index 08b653c7353..348f829d192 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -235,8 +235,8 @@ std::unique_ptr transform_fn(InputIterator begin, InputIterator end, rmm::device_buffer&& null_mask, size_type null_count, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const size = cudf::distance(begin, end); @@ -250,60 +250,65 @@ std::unique_ptr transform_fn(InputIterator begin, if (size == 0) return output; auto output_view = output->mutable_view(); - thrust::transform( - rmm::exec_policy(stream)->on(stream), begin, end, output_view.begin(), UFN{}); + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), + begin, + end, + output_view.begin(), + UFN{}); return output; } template std::unique_ptr transform_fn(cudf::dictionary_column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto dictionary_view = cudf::column_device_view::create(input.parent(), stream); auto 
dictionary_itr = dictionary::detail::make_dictionary_iterator(*dictionary_view); auto default_mr = rmm::mr::get_current_device_resource(); // call unary-op using temporary output buffer - auto output = transform_fn( - dictionary_itr, - dictionary_itr + input.size(), - detail::copy_bitmask(input.parent(), rmm::cuda_stream_view{stream}, default_mr), - input.null_count(), - default_mr, - stream); + auto output = transform_fn(dictionary_itr, + dictionary_itr + input.size(), + detail::copy_bitmask(input.parent(), stream, default_mr), + input.null_count(), + stream, + default_mr); return cudf::dictionary::detail::encode( - output->view(), dictionary::detail::get_indices_type_for_size(output->size()), mr, stream); + output->view(), + dictionary::detail::get_indices_type_for_size(output->size()), + mr, + stream.value()); } template struct MathOpDispatcher { template ::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return transform_fn( input.begin(), input.end(), cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), input.null_count(), - mr, - stream); + stream, + mr); } struct dictionary_dispatch { template ::value>* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return transform_fn(input, mr, stream); + return transform_fn(input, stream, mr); } template ::value>* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("dictionary keys must be numeric for this operation"); } @@ -313,21 +318,21 @@ struct MathOpDispatcher { typename std::enable_if_t::value and std::is_same::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return empty_like(input); auto dictionary_col = dictionary_column_view(input); return type_dispatcher( - dictionary_col.keys().type(), dictionary_dispatch{}, dictionary_col, mr, stream); + dictionary_col.keys().type(), dictionary_dispatch{}, dictionary_col, stream, mr); } template ::value and !std::is_same::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unsupported data type for operation"); } @@ -337,31 +342,30 @@ template struct BitwiseOpDispatcher { template ::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return transform_fn( - input.begin(), - input.end(), - cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), - input.null_count(), - mr, - stream); + return transform_fn(input.begin(), + input.end(), + cudf::detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); } struct dictionary_dispatch { template ::value>* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, - 
rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return transform_fn(input, mr, stream); + return transform_fn(input, stream, mr); } template ::value>* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("dictionary keys type not supported for this operation"); } @@ -371,21 +375,21 @@ struct BitwiseOpDispatcher { typename std::enable_if_t::value and std::is_same::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return empty_like(input); auto dictionary_col = dictionary_column_view(input); return type_dispatcher( - dictionary_col.keys().type(), dictionary_dispatch{}, dictionary_col, mr, stream); + dictionary_col.keys().type(), dictionary_dispatch{}, dictionary_col, stream, mr); } template ::value and !std::is_same::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unsupported datatype for operation"); } @@ -403,39 +407,38 @@ struct LogicalOpDispatcher { public: template ()>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return transform_fn( - input.begin(), - input.end(), - cudf::detail::copy_bitmask(input, rmm::cuda_stream_view{stream}, mr), - input.null_count(), - mr, - stream); + return transform_fn(input.begin(), + input.end(), + cudf::detail::copy_bitmask(input, stream, mr), + input.null_count(), + + stream, + mr); } struct dictionary_dispatch { template ()>* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto dictionary_view = cudf::column_device_view::create(input.parent(), stream); auto dictionary_itr = dictionary::detail::make_dictionary_iterator(*dictionary_view); - return transform_fn( - dictionary_itr, - dictionary_itr + input.size(), - cudf::detail::copy_bitmask(input.parent(), rmm::cuda_stream_view{stream}, mr), - input.null_count(), - mr, - stream); + return transform_fn(dictionary_itr, + dictionary_itr + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); } template ()>* = nullptr> std::unique_ptr operator()(cudf::dictionary_column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("dictionary keys type not supported for this operation"); } @@ -445,13 +448,13 @@ struct LogicalOpDispatcher { typename std::enable_if_t() and std::is_same::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return make_empty_column(cudf::data_type{cudf::type_id::BOOL8}); auto dictionary_col = 
dictionary_column_view(input); return type_dispatcher( - dictionary_col.keys().type(), dictionary_dispatch{}, dictionary_col, mr, stream); + dictionary_col.keys().type(), dictionary_dispatch{}, dictionary_col, stream, mr); } // template ()>* = nullptr> @@ -459,8 +462,8 @@ struct LogicalOpDispatcher { typename std::enable_if_t() and !std::is_same::value>* = nullptr> std::unique_ptr operator()(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("Unsupported datatype for operation"); } @@ -470,79 +473,79 @@ struct LogicalOpDispatcher { std::unique_ptr unary_operation(cudf::column_view const& input, cudf::unary_op op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { switch (op) { case cudf::unary_op::SIN: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::COS: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::TAN: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ARCSIN: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ARCCOS: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ARCTAN: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::SINH: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::COSH: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::TANH: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ARCSINH: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ARCCOSH: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ARCTANH: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::EXP: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::LOG: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::SQRT: return cudf::type_dispatcher( - 
input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::CBRT: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::CEIL: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::FLOOR: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::ABS: return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::RINT: CUDF_EXPECTS( (input.type().id() == type_id::FLOAT32) or (input.type().id() == type_id::FLOAT64), "rint expects floating point values"); return cudf::type_dispatcher( - input.type(), detail::MathOpDispatcher{}, input, mr, stream); + input.type(), detail::MathOpDispatcher{}, input, stream, mr); case cudf::unary_op::BIT_INVERT: return cudf::type_dispatcher( - input.type(), detail::BitwiseOpDispatcher{}, input, mr, stream); + input.type(), detail::BitwiseOpDispatcher{}, input, stream, mr); case cudf::unary_op::NOT: return cudf::type_dispatcher( - input.type(), detail::LogicalOpDispatcher{}, input, mr, stream); + input.type(), detail::LogicalOpDispatcher{}, input, stream, mr); default: CUDF_FAIL("Undefined unary operation"); } } @@ -554,7 +557,7 @@ std::unique_ptr unary_operation(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::unary_operation(input, op, mr); + return detail::unary_operation(input, op, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/unary/nan_ops.cu b/cpp/src/unary/nan_ops.cu index 33600e83530..9f8f0e53cb2 100644 --- a/cpp/src/unary/nan_ops.cu +++ b/cpp/src/unary/nan_ops.cu @@ -21,6 +21,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { @@ -29,19 +30,27 @@ struct nan_dispatcher { std::enable_if_t::value, std::unique_ptr> operator()( cudf::column_view const& input, Predicate predicate, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto input_device_view = column_device_view::create(input); if (input.has_nulls()) { auto input_pair_iterator = make_pair_iterator(*input_device_view); - return true_if( - input_pair_iterator, input_pair_iterator + input.size(), input.size(), predicate, mr); + return true_if(input_pair_iterator, + input_pair_iterator + input.size(), + input.size(), + predicate, + stream, + mr); } else { auto input_pair_iterator = make_pair_iterator(*input_device_view); - return true_if( - input_pair_iterator, input_pair_iterator + input.size(), input.size(), predicate, mr); + return true_if(input_pair_iterator, + input_pair_iterator + input.size(), + input.size(), + predicate, + stream, + mr); } } @@ -49,33 +58,33 @@ struct nan_dispatcher { std::enable_if_t::value, std::unique_ptr> operator()( cudf::column_view const& input, Predicate predicate, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FAIL("NAN is not supported in a Non-floating point type column"); } }; std::unique_ptr 
is_nan(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto predicate = [] __device__(auto element_validity_pair) { return element_validity_pair.second and std::isnan(element_validity_pair.first); }; - return cudf::type_dispatcher(input.type(), nan_dispatcher{}, input, predicate, mr, stream); + return cudf::type_dispatcher(input.type(), nan_dispatcher{}, input, predicate, stream, mr); } std::unique_ptr is_not_nan(cudf::column_view const& input, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto predicate = [] __device__(auto element_validity_pair) { return !element_validity_pair.second or !std::isnan(element_validity_pair.first); }; - return cudf::type_dispatcher(input.type(), nan_dispatcher{}, input, predicate, mr, stream); + return cudf::type_dispatcher(input.type(), nan_dispatcher{}, input, predicate, stream, mr); } } // namespace detail @@ -83,14 +92,14 @@ std::unique_ptr is_not_nan(cudf::column_view const& input, std::unique_ptr is_nan(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_nan(input, mr); + return detail::is_nan(input, rmm::cuda_stream_default, mr); } std::unique_ptr is_not_nan(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_not_nan(input, mr); + return detail::is_not_nan(input, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/unary/null_ops.cu b/cpp/src/unary/null_ops.cu index 3355cfc348d..699439da1c9 100644 --- a/cpp/src/unary/null_ops.cu +++ b/cpp/src/unary/null_ops.cu @@ -20,6 +20,7 @@ #include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { std::unique_ptr is_null(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) @@ -32,6 +33,7 @@ std::unique_ptr is_null(cudf::column_view const& input, rmm::mr::device_ thrust::make_counting_iterator(input.size()), input.size(), predicate, + rmm::cuda_stream_default, mr); } @@ -46,6 +48,7 @@ std::unique_ptr is_valid(cudf::column_view const& input, thrust::make_counting_iterator(input.size()), input.size(), predicate, + rmm::cuda_stream_default, mr); } diff --git a/cpp/src/unary/unary_ops.cuh b/cpp/src/unary/unary_ops.cuh index 51b63806cfa..a74a05437be 100644 --- a/cpp/src/unary/unary_ops.cuh +++ b/cpp/src/unary/unary_ops.cuh @@ -20,8 +20,10 @@ #include #include #include +#include #include #include +#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace unary { @@ -29,8 +31,8 @@ template struct launcher { static std::unique_ptr launch(cudf::column_view const& input, cudf::unary_op op, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream = 0) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::unique_ptr output = [&] { if (op == cudf::unary_op::NOT) { @@ -40,12 +42,12 @@ struct launcher { return std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), 0, mr}, - copy_bitmask(input, 0, mr), + cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); } else { return cudf::detail::allocate_like( - input, input.size(), mask_allocation_policy::NEVER, mr, stream); + input, input.size(), mask_allocation_policy::NEVER, stream, mr); } }(); @@ -62,7 +64,7 @@ struct launcher { rmm::device_buffer{input.null_mask(), bitmask_allocation_size_bytes(input.size())}, input.null_count()); - 
thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), output_view.begin(), From 6250687520808c5f86e67fc895d8f0cedbbe3e8a Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 10 Nov 2020 15:53:53 +1100 Subject: [PATCH 41/51] Fix JNI build after cuda_stream_view changes --- java/pom.xml | 1 + java/src/main/native/src/TableJni.cpp | 319 +++--- java/src/main/native/src/map_lookup.cu | 77 +- java/src/main/native/src/map_lookup.hpp | 61 +- java/src/main/native/src/row_conversion.cu | 1012 +++++++++---------- java/src/main/native/src/row_conversion.hpp | 25 +- 6 files changed, 690 insertions(+), 805 deletions(-) diff --git a/java/pom.xml b/java/pom.xml index d14d4202e21..8894d9eae46 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -351,6 +351,7 @@ + #include #include -#include #include +#include #include #include #include @@ -34,10 +34,11 @@ #include #include #include +#include #include "cudf_jni_apis.hpp" -#include "row_conversion.hpp" #include "dtype_utils.hpp" +#include "row_conversion.hpp" namespace cudf { namespace jni { @@ -108,7 +109,7 @@ class jni_writer_data_sink final : public cudf::io::data_sink { bool supports_device_write() const override { return true; } - void device_write(void const *gpu_data, size_t size, cudaStream_t stream) { + void device_write(void const *gpu_data, size_t size, rmm::cuda_stream_view stream) { JNIEnv *env = cudf::jni::get_jni_env(jvm); size_t left_to_copy = size; const char *copy_from = static_cast(gpu_data); @@ -116,7 +117,7 @@ class jni_writer_data_sink final : public cudf::io::data_sink { long buffer_amount_available = current_buffer_len - current_buffer_written; if (buffer_amount_available <= 0) { // should never be < 0, but just to be safe - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); rotate_buffer(env); buffer_amount_available = current_buffer_len - current_buffer_written; } @@ -124,14 +125,15 @@ class jni_writer_data_sink final : public cudf::io::data_sink { left_to_copy < buffer_amount_available ? 
left_to_copy : buffer_amount_available; char *copy_to = current_buffer_data + current_buffer_written; - CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost, stream)); + CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost, + stream.value())); copy_from = copy_from + amount_to_copy; current_buffer_written += amount_to_copy; total_written += amount_to_copy; left_to_copy -= amount_to_copy; } - CUDA_TRY(cudaStreamSynchronize(stream)); + stream.synchronize(); } void flush() override { @@ -195,26 +197,18 @@ template class jni_table_writer_handle final { std::unique_ptr sink; }; -typedef jni_table_writer_handle - native_parquet_writer_handle; +typedef jni_table_writer_handle native_parquet_writer_handle; typedef jni_table_writer_handle native_orc_writer_handle; class native_arrow_ipc_writer_handle final { public: - explicit native_arrow_ipc_writer_handle( - const std::vector& col_names, - const std::string& file_name): - initialized(false), - column_names(col_names), - file_name(file_name) {} - - explicit native_arrow_ipc_writer_handle( - const std::vector& col_names, - const std::shared_ptr& sink): - initialized(false), - column_names(col_names), - sink(sink), - file_name("") {} + explicit native_arrow_ipc_writer_handle(const std::vector &col_names, + const std::string &file_name) + : initialized(false), column_names(col_names), file_name(file_name) {} + + explicit native_arrow_ipc_writer_handle(const std::vector &col_names, + const std::shared_ptr &sink) + : initialized(false), column_names(col_names), sink(sink), file_name("") {} bool initialized; std::vector column_names; @@ -222,7 +216,7 @@ class native_arrow_ipc_writer_handle final { std::shared_ptr sink; std::shared_ptr writer; - void write(std::shared_ptr& arrow_tab, int64_t max_chunk) { + void write(std::shared_ptr &arrow_tab, int64_t max_chunk) { if (!initialized) { if (!sink) { auto tmp_sink = arrow::io::FileOutputStream::Open(file_name); @@ -252,7 +246,6 @@ class native_arrow_ipc_writer_handle final { } }; - class jni_arrow_output_stream final : public arrow::io::OutputStream { public: explicit jni_arrow_output_stream(JNIEnv *env, jobject callback) { @@ -292,11 +285,11 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream { current_buffer = nullptr; } - arrow::Status Write(const std::shared_ptr & data) override { + arrow::Status Write(const std::shared_ptr &data) override { return Write(data->data(), data->size()); } - arrow::Status Write(const void* data, int64_t nbytes) override { + arrow::Status Write(const void *data, int64_t nbytes) override { JNIEnv *env = cudf::jni::get_jni_env(jvm); int64_t left_to_copy = nbytes; const char *copy_from = static_cast(data); @@ -346,13 +339,9 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream { return arrow::Status::OK(); } - arrow::Result Tell() const override { - return total_written; - } + arrow::Result Tell() const override { return total_written; } - bool closed() const override { - return is_closed; - } + bool closed() const override { return is_closed; } private: void rotate_buffer(JNIEnv *env) { @@ -389,8 +378,8 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream { class jni_arrow_input_stream final : public arrow::io::InputStream { public: - explicit jni_arrow_input_stream(JNIEnv *env, jobject callback) : - mm(arrow::default_cpu_memory_manager()) { + explicit jni_arrow_input_stream(JNIEnv *env, jobject callback) + : mm(arrow::default_cpu_memory_manager()) { if 
(env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); } @@ -400,8 +389,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { throw cudf::jni::jni_exception("class not found"); } - read_into_method = - env->GetMethodID(cls, "readInto", "(JJ)J"); + read_into_method = env->GetMethodID(cls, "readInto", "(JJ)J"); if (read_into_method == nullptr) { throw cudf::jni::jni_exception("readInto method"); } @@ -423,7 +411,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { callback = nullptr; } - arrow::Result Read(int64_t nbytes, void* out) override { + arrow::Result Read(int64_t nbytes, void *out) override { JNIEnv *env = cudf::jni::get_jni_env(jvm); jlong ret = read_into(env, reinterpret_cast(out), nbytes); total_read += ret; @@ -432,7 +420,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { arrow::Result> Read(int64_t nbytes) override { JNIEnv *env = cudf::jni::get_jni_env(jvm); - arrow::Result> tmp_buffer = + arrow::Result> tmp_buffer = arrow::AllocateResizableBuffer(nbytes); if (!tmp_buffer.ok()) { return tmp_buffer; @@ -444,7 +432,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { } return tmp_buffer; } - + arrow::Status Close() override { is_closed = true; return arrow::Status::OK(); @@ -455,13 +443,9 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { return arrow::Status::OK(); } - arrow::Result Tell() const override { - return total_read; - } + arrow::Result Tell() const override { return total_read; } - bool closed() const override { - return is_closed; - } + bool closed() const override { return is_closed; } private: jlong read_into(JNIEnv *env, jlong addr, jlong len) { @@ -483,8 +467,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream { class native_arrow_ipc_reader_handle final { public: - explicit native_arrow_ipc_reader_handle( - const std::string& file_name) { + explicit native_arrow_ipc_reader_handle(const std::string &file_name) { auto tmp_source = arrow::io::ReadableFile::Open(file_name); if (!tmp_source.ok()) { throw std::runtime_error(tmp_source.status().message()); @@ -497,9 +480,8 @@ class native_arrow_ipc_reader_handle final { reader = *tmp_reader; } - explicit native_arrow_ipc_reader_handle( - std::shared_ptr source): - source(source) { + explicit native_arrow_ipc_reader_handle(std::shared_ptr source) + : source(source) { auto tmp_reader = arrow::ipc::RecordBatchStreamReader::Open(source); if (!tmp_reader.ok()) { throw std::runtime_error(tmp_reader.status().message()); @@ -528,7 +510,7 @@ class native_arrow_ipc_reader_handle final { // EOF return std::unique_ptr(); } - arrow::Result> tmp = + arrow::Result> tmp = arrow::Table::FromRecordBatches(reader->schema(), batches); if (!tmp.ok()) { throw std::runtime_error(tmp.status().message()); @@ -539,9 +521,7 @@ class native_arrow_ipc_reader_handle final { std::shared_ptr source; std::shared_ptr reader; - void close() { - source->Close(); - } + void close() { source->Close(); } }; /** @@ -584,8 +564,8 @@ bool valid_window_parameters(native_jintArray const &values, // Check that time-range window parameters are valid. 
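The jni_writer_data_sink::device_write change earlier in this file shows the interop pattern used throughout the patch wherever a raw CUDA API or a kernel launch still needs a cudaStream_t: pass stream.value(), and replace cudaStreamSynchronize with the view's own synchronize(). A minimal sketch, assuming h_buf, d_buf and nbytes are already set up (illustrative names, not taken from the patch):

rmm::cuda_stream_view stream = rmm::cuda_stream_default;
// Raw CUDA calls (and <<<...>>> launches) take the underlying cudaStream_t via value().
CUDA_TRY(cudaMemcpyAsync(h_buf, d_buf, nbytes, cudaMemcpyDeviceToHost, stream.value()));
// Host-side waits go through the view instead of CUDA_TRY(cudaStreamSynchronize(stream)).
stream.synchronize();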
bool valid_window_parameters(native_jintArray const &values, native_jintArray const ×tamps, native_jpointerArray const &ops, - native_jintArray const &min_periods, - native_jintArray const &preceding, native_jintArray const &following) { + native_jintArray const &min_periods, native_jintArray const &preceding, + native_jintArray const &following) { return values.size() == timestamps.size() && valid_window_parameters(values, ops, min_periods, preceding, following); } @@ -697,8 +677,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray n_table_handles(env, - j_table_handles); + cudf::jni::native_jpointerArray n_table_handles(env, j_table_handles); const cudf::jni::native_jintArray n_sort_key_indexes(env, j_sort_key_indexes); jsize num_columns = n_sort_key_indexes.size(); @@ -738,10 +717,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass tables.push_back(*n_table_handles[i]); } - std::unique_ptr result = cudf::merge(tables, - indexes, - order, - null_order); + std::unique_ptr result = cudf::merge(tables, indexes, order, null_order); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, NULL); @@ -790,19 +766,19 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( } cudf::io::csv_reader_options opts = cudf::io::csv_reader_options::builder(*source) - .delimiter(delim) - .header(header_row) - .names(n_col_names.as_cpp_vector()) - .dtypes(n_data_types.as_cpp_vector()) - .use_cols_names(n_filter_col_names.as_cpp_vector()) - .true_values(n_true_values.as_cpp_vector()) - .false_values(n_false_values.as_cpp_vector()) - .na_values(n_null_values.as_cpp_vector()) - .keep_default_na(false) - .na_filter(n_null_values.size() > 0) - .quotechar(quote) - .comment(comment) - .build(); + .delimiter(delim) + .header(header_row) + .names(n_col_names.as_cpp_vector()) + .dtypes(n_data_types.as_cpp_vector()) + .use_cols_names(n_filter_col_names.as_cpp_vector()) + .true_values(n_true_values.as_cpp_vector()) + .false_values(n_false_values.as_cpp_vector()) + .na_values(n_null_values.as_cpp_vector()) + .keep_default_na(false) + .na_filter(n_null_values.size() > 0) + .quotechar(quote) + .comment(comment) + .build(); cudf::io::table_with_metadata result = cudf::io::read_csv(opts); return cudf::jni::convert_table_for_return(env, result.tbl); } @@ -842,11 +818,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( } cudf::io::parquet_reader_options opts = - cudf::io::parquet_reader_options::builder(*source) - .columns(n_filter_col_names.as_cpp_vector()) - .convert_strings_to_categories(false) - .timestamp_type(cudf::data_type(static_cast(unit))) - .build(); + cudf::io::parquet_reader_options::builder(*source) + .columns(n_filter_col_names.as_cpp_vector()) + .convert_strings_to_categories(false) + .timestamp_type(cudf::data_type(static_cast(unit))) + .build(); cudf::io::table_with_metadata result = cudf::io::read_parquet(opts); return cudf::jni::convert_table_for_return(env, result.tbl); } @@ -883,11 +859,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( new cudf::jni::jni_writer_data_sink(env, consumer)); sink_info sink{data_sink.get()}; chunked_parquet_writer_options opts = - chunked_parquet_writer_options::builder(sink) - .nullable_metadata(&metadata) - .compression(static_cast(j_compression)) - .stats_level(static_cast(j_stats_freq)) - .build(); + chunked_parquet_writer_options::builder(sink) + 
.nullable_metadata(&metadata) + .compression(static_cast(j_compression)) + .stats_level(static_cast(j_stats_freq)) + .build(); std::shared_ptr state = write_parquet_chunked_begin(opts); cudf::jni::native_parquet_writer_handle *ret = new cudf::jni::native_parquet_writer_handle(state, data_sink); @@ -925,11 +901,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( sink_info sink{output_path.get()}; chunked_parquet_writer_options opts = - chunked_parquet_writer_options::builder(sink) - .nullable_metadata(&metadata) - .compression(static_cast(j_compression)) - .stats_level(static_cast(j_stats_freq)) - .build(); + chunked_parquet_writer_options::builder(sink) + .nullable_metadata(&metadata) + .compression(static_cast(j_compression)) + .stats_level(static_cast(j_stats_freq)) + .build(); std::shared_ptr state = write_parquet_chunked_begin(opts); cudf::jni::native_parquet_writer_handle *ret = new cudf::jni::native_parquet_writer_handle(state); @@ -1007,12 +983,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC( source.reset(new cudf::io::source_info(filename.get())); } - cudf::io::orc_reader_options opts = cudf::io::orc_reader_options::builder(*source) - .columns(n_filter_col_names.as_cpp_vector()) - .use_index(false) - .use_np_dtypes(static_cast(usingNumPyTypes)) - .timestamp_type(cudf::data_type(static_cast(unit))) - .build(); + cudf::io::orc_reader_options opts = + cudf::io::orc_reader_options::builder(*source) + .columns(n_filter_col_names.as_cpp_vector()) + .use_index(false) + .use_np_dtypes(static_cast(usingNumPyTypes)) + .timestamp_type(cudf::data_type(static_cast(unit))) + .build(); cudf::io::table_with_metadata result = cudf::io::read_orc(opts); return cudf::jni::convert_table_for_return(env, result.tbl); } @@ -1048,12 +1025,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( std::unique_ptr data_sink( new cudf::jni::jni_writer_data_sink(env, consumer)); sink_info sink{data_sink.get()}; - chunked_orc_writer_options opts = - chunked_orc_writer_options::builder(sink) - .metadata(&metadata) - .compression(static_cast(j_compression)) - .enable_statistics(true) - .build(); + chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) + .metadata(&metadata) + .compression(static_cast(j_compression)) + .enable_statistics(true) + .build(); std::shared_ptr state = write_orc_chunked_begin(opts); cudf::jni::native_orc_writer_handle *ret = new cudf::jni::native_orc_writer_handle(state, data_sink); @@ -1090,12 +1066,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( } sink_info sink{output_path.get()}; - chunked_orc_writer_options opts = - chunked_orc_writer_options::builder(sink) - .metadata(&metadata) - .compression(static_cast(j_compression)) - .enable_statistics(true) - .build(); + chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) + .metadata(&metadata) + .compression(static_cast(j_compression)) + .enable_statistics(true) + .build(); std::shared_ptr state = write_orc_chunked_begin(opts); cudf::jni::native_orc_writer_handle *ret = new cudf::jni::native_orc_writer_handle(state); return reinterpret_cast(ret); @@ -1138,10 +1113,9 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCEnd(JNIEnv *env, jclass CATCH_STD(env, ) } - -JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin( - JNIEnv *env, jclass, jobjectArray j_col_names, - jobject consumer) { +JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin(JNIEnv *env, jclass, 
+ jobjectArray j_col_names, + jobject consumer) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, consumer, "null consumer", 0); try { @@ -1152,17 +1126,15 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin( new cudf::jni::jni_arrow_output_stream(env, consumer)); cudf::jni::native_arrow_ipc_writer_handle *ret = - new cudf::jni::native_arrow_ipc_writer_handle( - col_names.as_cpp_vector(), - data_sink); + new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), data_sink); return reinterpret_cast(ret); } CATCH_STD(env, 0) } -JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCFileBegin( - JNIEnv *env, jclass, jobjectArray j_col_names, - jstring j_output_path) { +JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCFileBegin(JNIEnv *env, jclass, + jobjectArray j_col_names, + jstring j_output_path) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_output_path, "null output path", 0); try { @@ -1171,9 +1143,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCFileBegin( cudf::jni::native_jstring output_path(env, j_output_path); cudf::jni::native_arrow_ipc_writer_handle *ret = - new cudf::jni::native_arrow_ipc_writer_handle( - col_names.as_cpp_vector(), - output_path.get()); + new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), output_path.get()); return reinterpret_cast(ret); } CATCH_STD(env, 0) @@ -1191,15 +1161,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_convertCudfToArrowTable(JNIEnv try { cudf::jni::auto_set_device(env); - std::unique_ptr> result(new std::shared_ptr(nullptr)); + std::unique_ptr> result( + new std::shared_ptr(nullptr)); auto column_metadata = std::vector{}; column_metadata.reserve(state->column_names.size()); - std::transform( - std::begin(state->column_names), - std::end(state->column_names), - std::back_inserter(column_metadata), - [](auto const& column_name) { return cudf::column_metadata{column_name}; } - ); + std::transform(std::begin(state->column_names), std::end(state->column_names), + std::back_inserter(column_metadata), + [](auto const &column_name) { return cudf::column_metadata{column_name}; }); *result = cudf::to_arrow(*tview, column_metadata); if (!result->get()) { return 0; @@ -1243,7 +1211,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCEnd(JNIEnv *env, j } JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCFileBegin(JNIEnv *env, jclass, - jstring j_input_path) { + jstring j_input_path) { JNI_NULL_CHECK(env, j_input_path, "null input path", 0); try { cudf::jni::auto_set_device(env); @@ -1257,7 +1225,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCFileBegin(JNIEnv *e } JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCBufferBegin(JNIEnv *env, jclass, - jobject provider) { + jobject provider) { JNI_NULL_CHECK(env, provider, "null provider", 0); try { cudf::jni::auto_set_device(env); @@ -1272,10 +1240,9 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCBufferBegin(JNIEnv CATCH_STD(env, 0) } -JNIEXPORT jlong JNICALL -Java_ai_rapids_cudf_Table_readArrowIPCChunkToArrowTable(JNIEnv *env, jclass, - jlong j_state, - jint row_target) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readArrowIPCChunkToArrowTable(JNIEnv *env, jclass, + jlong j_state, + jint row_target) { JNI_NULL_CHECK(env, j_state, "null state", 0); cudf::jni::native_arrow_ipc_reader_handle *state = @@ -1285,7 +1252,8 @@ 
Java_ai_rapids_cudf_Table_readArrowIPCChunkToArrowTable(JNIEnv *env, jclass, cudf::jni::auto_set_device(env); // This is a little odd because we have to return a pointer // and arrow wants to deal with shared pointers for everything. - std::unique_ptr> result(new std::shared_ptr(nullptr)); + std::unique_ptr> result( + new std::shared_ptr(nullptr)); *result = state->next(row_target); if (!result->get()) { return 0; @@ -1307,8 +1275,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_closeArrowTable(JNIEnv *env, jc CATCH_STD(env, ) } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertArrowTableToCudf(JNIEnv *env, jclass, - jlong arrow_table_handle) { +JNIEXPORT jlongArray JNICALL +Java_ai_rapids_cudf_Table_convertArrowTableToCudf(JNIEnv *env, jclass, jlong arrow_table_handle) { JNI_NULL_CHECK(env, arrow_table_handle, "null arrow handle", 0); std::shared_ptr *handle = @@ -1323,7 +1291,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertArrowTableToCudf(J } JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_readArrowIPCEnd(JNIEnv *env, jclass, - jlong j_state) { + jlong j_state) { JNI_NULL_CHECK(env, j_state, "null state", ); cudf::jni::native_arrow_ipc_reader_handle *state = @@ -1336,12 +1304,9 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_readArrowIPCEnd(JNIEnv *env, jc CATCH_STD(env, ) } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoin(JNIEnv *env, jclass clazz, - jlong left_table, - jintArray left_col_join_indices, - jlong right_table, - jintArray right_col_join_indices, - jboolean compare_nulls_equal) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoin( + JNIEnv *env, jclass clazz, jlong left_table, jintArray left_col_join_indices, jlong right_table, + jintArray right_col_join_indices, jboolean compare_nulls_equal) { JNI_NULL_CHECK(env, left_table, "left_table is null", NULL); JNI_NULL_CHECK(env, left_col_join_indices, "left_col_join_indices is null", NULL); JNI_NULL_CHECK(env, right_table, "right_table is null", NULL); @@ -1367,19 +1332,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoin(JNIEnv *env, jcl std::unique_ptr result = cudf::left_join(*n_left_table, *n_right_table, left_join_cols, right_join_cols, dedupe, - static_cast(compare_nulls_equal)? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); + static_cast(compare_nulls_equal) ? cudf::null_equality::EQUAL : + cudf::null_equality::UNEQUAL); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, NULL); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoin(JNIEnv *env, jclass clazz, - jlong left_table, - jintArray left_col_join_indices, - jlong right_table, - jintArray right_col_join_indices, - jboolean compare_nulls_equal) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoin( + JNIEnv *env, jclass clazz, jlong left_table, jintArray left_col_join_indices, jlong right_table, + jintArray right_col_join_indices, jboolean compare_nulls_equal) { JNI_NULL_CHECK(env, left_table, "left_table is null", NULL); JNI_NULL_CHECK(env, left_col_join_indices, "left_col_join_indices is null", NULL); JNI_NULL_CHECK(env, right_table, "right_table is null", NULL); @@ -1405,19 +1368,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoin(JNIEnv *env, jc std::unique_ptr result = cudf::inner_join(*n_left_table, *n_right_table, left_join_cols, right_join_cols, dedupe, - static_cast(compare_nulls_equal)? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); + static_cast(compare_nulls_equal) ? 
cudf::null_equality::EQUAL : + cudf::null_equality::UNEQUAL); return cudf::jni::convert_table_for_return(env, result); } CATCH_STD(env, NULL); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoin(JNIEnv *env, jclass clazz, - jlong left_table, - jintArray left_col_join_indices, - jlong right_table, - jintArray right_col_join_indices, - jboolean compare_nulls_equal) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoin( + JNIEnv *env, jclass clazz, jlong left_table, jintArray left_col_join_indices, jlong right_table, + jintArray right_col_join_indices, jboolean compare_nulls_equal) { JNI_NULL_CHECK(env, left_table, "left_table is null", NULL); JNI_NULL_CHECK(env, left_col_join_indices, "left_col_join_indices is null", NULL); JNI_NULL_CHECK(env, right_table, "right_table is null", NULL); @@ -1443,7 +1404,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoin(JNIEnv *env, jcl std::unique_ptr result = cudf::full_join(*n_left_table, *n_right_table, left_join_cols, right_join_cols, dedupe, - static_cast(compare_nulls_equal)? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); + static_cast(compare_nulls_equal) ? cudf::null_equality::EQUAL : + cudf::null_equality::UNEQUAL); return cudf::jni::convert_table_for_return(env, result); } @@ -1475,7 +1437,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftSemiJoin( std::unique_ptr result = cudf::left_semi_join( *n_left_table, *n_right_table, left_join_cols, right_join_cols, return_cols, - static_cast(compare_nulls_equal)? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); + static_cast(compare_nulls_equal) ? cudf::null_equality::EQUAL : + cudf::null_equality::UNEQUAL); return cudf::jni::convert_table_for_return(env, result); } @@ -1507,7 +1470,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoin( std::unique_ptr result = cudf::left_anti_join( *n_left_table, *n_right_table, left_join_cols, right_join_cols, return_cols, - static_cast(compare_nulls_equal)? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL); + static_cast(compare_nulls_equal) ? 
cudf::null_equality::EQUAL : + cudf::null_equality::UNEQUAL); return cudf::jni::convert_table_for_return(env, result); } @@ -1525,8 +1489,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jc cudf::table_view *n_left_table = reinterpret_cast(left_table); cudf::table_view *n_right_table = reinterpret_cast(right_table); - std::unique_ptr result = - cudf::cross_join(*n_left_table, *n_right_table); + std::unique_ptr result = cudf::cross_join(*n_left_table, *n_right_table); return cudf::jni::convert_table_for_return(env, result); } @@ -1702,10 +1665,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_filter(JNIEnv *env, jclas CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclass, - jlong j_input, - jlong j_map, - jboolean check_bounds) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclass, jlong j_input, + jlong j_map, jboolean check_bounds) { JNI_NULL_CHECK(env, j_input, "input table is null", 0); JNI_NULL_CHECK(env, j_map, "map column is null", 0); try { @@ -1718,8 +1679,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows( - JNIEnv *env, jclass clazz, jlong input_table) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env, jclass clazz, + jlong input_table) { JNI_NULL_CHECK(env, input_table, "input table is null", 0); try { @@ -1736,8 +1697,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows( CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows( - JNIEnv *env, jclass clazz, jlong input_column, jintArray types, jintArray scale) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertFromRows(JNIEnv *env, jclass clazz, + jlong input_column, + jintArray types, + jintArray scale) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); JNI_NULL_CHECK(env, types, "types is null", 0); @@ -1851,10 +1814,8 @@ JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_Table_contiguousSplit(JNIEnv } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rollingWindowAggregate( - JNIEnv *env, jclass clazz, jlong j_input_table, jintArray j_keys, - jlongArray j_default_output, - jintArray j_aggregate_column_indices, jlongArray j_agg_instances, - jintArray j_min_periods, + JNIEnv *env, jclass clazz, jlong j_input_table, jintArray j_keys, jlongArray j_default_output, + jintArray j_aggregate_column_indices, jlongArray j_agg_instances, jintArray j_min_periods, jintArray j_preceding, jintArray j_following, jboolean ignore_null_keys) { JNI_NULL_CHECK(env, j_input_table, "input table is null", NULL); @@ -1893,13 +1854,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rollingWindowAggregate( int agg_column_index = values[i]; if (default_output[i] != nullptr) { result_columns.emplace_back(std::move(cudf::grouped_rolling_window( - groupby_keys, input_table->column(agg_column_index), *default_output[i], - preceding[i], following[i], - min_periods[i], agg_instances[i]->clone()))); + groupby_keys, input_table->column(agg_column_index), *default_output[i], preceding[i], + following[i], min_periods[i], agg_instances[i]->clone()))); } else { result_columns.emplace_back(std::move(cudf::grouped_rolling_window( - groupby_keys, input_table->column(agg_column_index), - preceding[i], following[i], + groupby_keys, input_table->column(agg_column_index), preceding[i], 
following[i], min_periods[i], agg_instances[i]->clone()))); } } diff --git a/java/src/main/native/src/map_lookup.cu b/java/src/main/native/src/map_lookup.cu index 03ffddecf1f..a3e25ce8905 100644 --- a/java/src/main/native/src/map_lookup.cu +++ b/java/src/main/native/src/map_lookup.cu @@ -27,25 +27,25 @@ #include #include +#include "rmm/cuda_stream_view.hpp" + namespace cudf { namespace { /** * @brief Device function that searches for the specified lookup_key - * in the list at index `row_index`, and writes out the index of the + * in the list at index `row_index`, and writes out the index of the * first match to the output. - * + * * This function is called once per row of the `input` column * If the lookup_key is not found, (-1) is returned for that list row. */ template -void __device__ search_each_list(size_type row_index, - column_device_view input, +void __device__ search_each_list(size_type row_index, column_device_view input, mutable_column_device_view output, - string_scalar_device_view lookup_key) -{ - if (has_nulls && input.is_null(row_index)) { // List row is null. - output.element(row_index) = -1; // Not found. + string_scalar_device_view lookup_key) { + if (has_nulls && input.is_null(row_index)) { // List row is null. + output.element(row_index) = -1; // Not found. return; } @@ -68,7 +68,7 @@ void __device__ search_each_list(size_type row_index, } } - output.element(row_index) = -1; // Not found. + output.element(row_index) = -1; // Not found. } /** @@ -76,17 +76,16 @@ void __device__ search_each_list(size_type row_index, * string in each list row of the `input` column. * * The kernel writes the index (into the `input` list-column's child) where the `lookup_key` - * is found, to the `output` column. If the `lookup_key` is not found, (-1) is written instead. + * is found, to the `output` column. If the `lookup_key` is not found, (-1) is written instead. * * The produces one output row per input, with no nulls. The output may then be used * with `cudf::gather()`, to find the values corresponding to the `lookup_key`. */ template -__launch_bounds__(block_size) __global__ void gpu_find_first(column_device_view input, - mutable_column_device_view output, - string_scalar_device_view lookup_key) -{ - size_type tid = blockIdx.x * block_size + threadIdx.x; +__launch_bounds__(block_size) __global__ + void gpu_find_first(column_device_view input, mutable_column_device_view output, + string_scalar_device_view lookup_key) { + size_type tid = blockIdx.x * block_size + threadIdx.x; size_type stride = block_size * gridDim.x; // Each CUDA thread processes one row of `input`. Each row is a list. @@ -106,37 +105,32 @@ __launch_bounds__(block_size) __global__ void gpu_find_first(column_device_view * for each row. 
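As a concrete illustration of the gather-map scheme described above (the data is made up for illustration and does not come from the patch): for a map column with two list rows [("a","1"), ("b","2")] and [("c","3")] and lookup_key "b", gpu_find_first writes the child indices [1, -1]. Gathering the values child (structs_column.child(1), whose flattened entries are "1", "2", "3") with that map under out_of_bounds_policy::IGNORE then yields ["2", null], matching the documented behaviour that rows where the key is not found come back null.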
*/ template -std::unique_ptr get_gather_map_for_map_values(column_view const& input, - string_scalar& lookup_key, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) -{ +std::unique_ptr +get_gather_map_for_map_values(column_view const &input, string_scalar &lookup_key, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { constexpr size_type block_size{256}; cudf::detail::grid_1d grid{input.size(), block_size}; auto input_device_view = cudf::column_device_view::create(input, stream); auto lookup_key_device_view{get_scalar_device_view(lookup_key)}; - auto gather_map = make_numeric_column( - data_type{cudf::type_to_id()}, input.size(), mask_state::ALL_VALID, stream, mr); + auto gather_map = make_numeric_column(data_type{cudf::type_to_id()}, input.size(), + mask_state::ALL_VALID, stream, mr); auto output_view = mutable_column_device_view::create(gather_map->mutable_view(), stream); - gpu_find_first<<>>( - *input_device_view, *output_view, lookup_key_device_view); + gpu_find_first<<>>( + *input_device_view, *output_view, lookup_key_device_view); CHECK_CUDA(stream); return gather_map; } -} // namespace +} // namespace namespace jni { -std::unique_ptr map_lookup(column_view const& map_column, - string_scalar lookup_key, - bool has_nulls, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) -{ +std::unique_ptr map_lookup(column_view const &map_column, string_scalar lookup_key, + bool has_nulls, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { // Defensive checks. CUDF_EXPECTS(map_column.type().id() == type_id::LIST, "Expected LIST>."); @@ -155,23 +149,20 @@ std::unique_ptr map_lookup(column_view const& map_column, // Two-pass plan: construct gather map, and then gather() on structs_column.child(1). Plan A. // (Can do in one pass perhaps, but that's Plan B.) - auto gather_map = has_nulls? - get_gather_map_for_map_values(map_column, lookup_key, mr, stream) - : get_gather_map_for_map_values(map_column, lookup_key, mr, stream); + auto gather_map = has_nulls ? + get_gather_map_for_map_values(map_column, lookup_key, stream, mr) : + get_gather_map_for_map_values(map_column, lookup_key, stream, mr); // Gather map is now available. - auto values_column = structs_column.child(1); + auto values_column = structs_column.child(1); auto table_for_gather = table_view{std::vector{values_column}}; - auto gathered_table = cudf::detail::gather(table_for_gather, - gather_map->view(), - detail::out_of_bounds_policy::IGNORE, - detail::negative_index_policy::NOT_ALLOWED, - mr, - stream); + auto gathered_table = cudf::detail::gather( + table_for_gather, gather_map->view(), detail::out_of_bounds_policy::IGNORE, + detail::negative_index_policy::NOT_ALLOWED, stream, mr); return std::make_unique(std::move(gathered_table->get_column(0))); } -} // namespace jni; -} // namespace cudf; \ No newline at end of file +} // namespace jni +} // namespace cudf diff --git a/java/src/main/native/src/map_lookup.hpp b/java/src/main/native/src/map_lookup.hpp index c0380fe3306..6d54bfa371d 100644 --- a/java/src/main/native/src/map_lookup.hpp +++ b/java/src/main/native/src/map_lookup.hpp @@ -17,41 +17,40 @@ #pragma once #include +#include namespace cudf { namespace jni { - /** - * @brief Looks up a "map" column by specified key, and returns a column of string values. - * - * The map-column is represented as follows: - * - * list_view >. - * <---KEY---> <--VALUE--> - * - * The string_view struct members are the key and value, respectively. 
- * For each row in the input list column, the value corresponding to the first match - * of the specified lookup_key is returned. If the key is not found, a null is returned. - * - * @param map_column The input "map" column to be searched. Must be of - * type list_view>. - * @param lookup_key The search key, whose value is to be returned for each list row - * @param has_nulls Whether the input column might contain null list-rows, or null keys. - * @param mr The device memory resource to be used for allocations - * @param stream The CUDA stream - * @return A string_view column with the value from the first match in each list. - * A null row is returned for any row where the lookup_key is not found. - * @throw cudf::logic_error If the input column is not of type - * list_view> - */ - std::unique_ptr map_lookup( - column_view const& map_column, - string_scalar lookup_key, - bool has_nulls = true, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - cudaStream_t stream = 0); +/** + * @brief Looks up a "map" column by specified key, and returns a column of string values. + * + * The map-column is represented as follows: + * + * list_view >. + * <---KEY---> <--VALUE--> + * + * The string_view struct members are the key and value, respectively. + * For each row in the input list column, the value corresponding to the first match + * of the specified lookup_key is returned. If the key is not found, a null is returned. + * + * @param map_column The input "map" column to be searched. Must be of + * type list_view>. + * @param lookup_key The search key, whose value is to be returned for each list row + * @param has_nulls Whether the input column might contain null list-rows, or null keys. + * @param stream The CUDA stream + * @param mr The device memory resource to be used for allocations + * @return A string_view column with the value from the first match in each list. + * A null row is returned for any row where the lookup_key is not found. + * @throw cudf::logic_error If the input column is not of type + * list_view> + */ +std::unique_ptr +map_lookup(column_view const &map_column, string_scalar lookup_key, bool has_nulls = true, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -} // namespace jni; +} // namespace jni -} // namespace cudf; \ No newline at end of file +} // namespace cudf diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 4448bc14044..a10ba9a2700 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -17,14 +17,14 @@ #include #include -#include #include #include #include +#include #include #include #include - +#include #include #include "row_conversion.hpp" @@ -37,299 +37,271 @@ namespace java { * the data on the same stream as is used to copy it. 
*/ template -std::unique_ptr> copy_to_dev_async( - const std::vector & input, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr) { - std::unique_ptr> ret(new rmm::device_uvector( - input.size(), stream, mr)); - CUDA_TRY(cudaMemcpyAsync(ret->data(), - input.data(), - sizeof(T) * input.size(), - cudaMemcpyHostToDevice, - stream)); - return ret; +std::unique_ptr> copy_to_dev_async(const std::vector &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + std::unique_ptr> ret(new rmm::device_uvector(input.size(), stream, mr)); + CUDA_TRY(cudaMemcpyAsync(ret->data(), input.data(), sizeof(T) * input.size(), + cudaMemcpyHostToDevice, stream.value())); + return ret; } -__global__ -void copy_to_fixed_width_columns( - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type* input_offset_in_row, - const cudf::size_type* num_bytes, - int8_t ** output_data, - cudf::bitmask_type ** output_nm, - const int8_t * input_data) { - - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // For simplicity we will refer to this as a row_group - - // In practice we have found writing more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). - - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1)/rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying from shared data in the same place - int8_t * row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t * row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; - row_group_index < row_group_end; - row_group_index += row_group_stride) { - // Step 1: Copy the data into shared memory - // We know row_size is always aligned with and a multiple of int64_t; - int64_t * long_shared = reinterpret_cast(shared_data); - const int64_t * long_input = reinterpret_cast(input_data); - - cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_output_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; - } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; +__global__ void copy_to_fixed_width_columns(const cudf::size_type num_rows, + const cudf::size_type num_columns, + const cudf::size_type row_size, + const cudf::size_type *input_offset_in_row, + const cudf::size_type *num_bytes, int8_t **output_data, + cudf::bitmask_type **output_nm, + const int8_t *input_data) { + + // We are going to copy the data in two passes. 
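A hedged usage sketch of the copy_to_dev_async helper above; the offsets vector and its contents are illustrative, and stream / mr are assumed to be an rmm::cuda_stream_view and a device memory resource already in scope:

std::vector<cudf::size_type> offsets{0, 16, 32};
// Returns a std::unique_ptr<rmm::device_uvector<cudf::size_type>> holding the device copy.
auto d_offsets = copy_to_dev_async(offsets, stream, mr);
// The copy is asynchronous: per the note above, consumers of d_offsets->data() must run on
// (or synchronize with) the same stream.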
+ // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // For simplicity we will refer to this as a row_group + + // In practice we have found writing more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). + + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying from shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = &row_tmp[input_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + // Step 1: Copy the data into shared memory + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + const int64_t *long_input = reinterpret_cast(input_data); + + cudf::size_type shared_output_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_output_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { + row_index_end = num_rows; + } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; - cudf::size_type shared_output_end = shared_length/sizeof(int64_t); + cudf::size_type shared_output_end = shared_length / sizeof(int64_t); - cudf::size_type start_input_index = (row_size * row_group_index * rows_per_group)/sizeof(int64_t); + cudf::size_type start_input_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - for (cudf::size_type shared_index = shared_output_index; - shared_index < shared_output_end; - shared_index += shared_output_stride) { - long_shared[shared_index] = long_input[start_input_index + shared_index]; - } - // Wait for all of the data to be in shared memory - __syncthreads(); - - // Step 2 copy the data back out - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data in for the next row group. 
- uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); - if (row_index < num_rows) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; - col_index < num_columns; - col_index += col_index_stride) { - - cudf::size_type col_size = num_bytes[col_index]; - const int8_t * col_tmp = &(row_tmp[input_offset_in_row[col_index]]); - int8_t * col_output = output_data[col_index]; - switch(col_size) { - case 1: - { - col_output[row_index] = *col_tmp; - break; - } - case 2: - { - int16_t * short_col_output = reinterpret_cast(col_output); - short_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 4: - { - int32_t * int_col_output = reinterpret_cast(col_output); - int_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - case 8: - { - int64_t * long_col_output = reinterpret_cast(col_output); - long_col_output[row_index] = *reinterpret_cast(col_tmp); - break; - } - default: - { - cudf::size_type output_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... - for (cudf::size_type b = 0; b < col_size; b++) { - col_output[b + output_offset] = col_tmp[b]; - } - break; - } - } - - cudf::bitmask_type * nm = output_nm[col_index]; - int8_t * valid_byte = &row_vld_tmp[col_index/8]; - cudf::size_type byte_bit_offset = col_index % 8; - int predicate = *valid_byte & (1 << byte_bit_offset); - uint32_t bitmask = __ballot_sync(active_mask, predicate); - if (row_index % 32 == 0) { - nm[word_index(row_index)] = bitmask; - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied before starting on the next row group - __syncthreads(); + for (cudf::size_type shared_index = shared_output_index; shared_index < shared_output_end; + shared_index += shared_output_stride) { + long_shared[shared_index] = long_input[start_input_index + shared_index]; } -} + // Wait for all of the data to be in shared memory + __syncthreads(); + + // Step 2 copy the data back out + + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data in for the next row group. 
+ uint32_t active_mask = __ballot_sync(0xffffffff, row_index < num_rows); + if (row_index < num_rows) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + + cudf::size_type col_size = num_bytes[col_index]; + const int8_t *col_tmp = &(row_tmp[input_offset_in_row[col_index]]); + int8_t *col_output = output_data[col_index]; + switch (col_size) { + case 1: { + col_output[row_index] = *col_tmp; + break; + } + case 2: { + int16_t *short_col_output = reinterpret_cast(col_output); + short_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 4: { + int32_t *int_col_output = reinterpret_cast(col_output); + int_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + case 8: { + int64_t *long_col_output = reinterpret_cast(col_output); + long_col_output[row_index] = *reinterpret_cast(col_tmp); + break; + } + default: { + cudf::size_type output_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... + for (cudf::size_type b = 0; b < col_size; b++) { + col_output[b + output_offset] = col_tmp[b]; + } + break; + } + } + cudf::bitmask_type *nm = output_nm[col_index]; + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + int predicate = *valid_byte & (1 << byte_bit_offset); + uint32_t bitmask = __ballot_sync(active_mask, predicate); + if (row_index % 32 == 0) { + nm[word_index(row_index)] = bitmask; + } + } // end column loop + } // end row copy + // wait for the row_group to be totally copied before starting on the next row group + __syncthreads(); + } +} -__global__ -void copy_from_fixed_width_columns( - const cudf::size_type start_row, - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type row_size, - const cudf::size_type* output_offset_in_row, - const cudf::size_type* num_bytes, - const int8_t ** input_data, - const cudf::bitmask_type ** input_nm, - int8_t * output_data) { - // We are going to copy the data in two passes. - // The first pass copies a chunk of data into shared memory. - // The second pass copies that chunk from shared memory out to the final location. - - // Because shared memory is limited we copy a subset of the rows at a time. - // We do not support copying a subset of the columns in a row yet, so we don't - // currently support a row that is wider than shared memory. - // For simplicity we will refer to this as a row_group - - // In practice we have found reading more than 4 columns of data per thread - // results in performance loss. As such we are using a 2 dimensional - // kernel in terms of threads, but not in terms of blocks. Columns are - // controlled by the y dimension (there is no y dimension in blocks). Rows - // are controlled by the x dimension (there are multiple blocks in the x - // dimension). 
- - cudf::size_type rows_per_group = blockDim.x; - cudf::size_type row_group_start = blockIdx.x; - cudf::size_type row_group_stride = gridDim.x; - cudf::size_type row_group_end = (num_rows + rows_per_group - 1)/rows_per_group + 1; - - extern __shared__ int8_t shared_data[]; - - - // Because we are copying fixed width only data and we stride the rows - // this thread will always start copying to shared data in the same place - int8_t * row_tmp = &shared_data[row_size * threadIdx.x]; - int8_t * row_vld_tmp = &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; - - for (cudf::size_type row_group_index = row_group_start; - row_group_index < row_group_end; - row_group_index += row_group_stride) { - - // Within the row group there should be 1 thread for each row. This is a - // requirement for launching the kernel - cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; - // But we might not use all of the threads if the number of rows does not go - // evenly into the thread count. We don't want those threads to exit yet - // because we may need them to copy data back out. - if (row_index < (start_row + num_rows)) { - cudf::size_type col_index_start = threadIdx.y; - cudf::size_type col_index_stride = blockDim.y; - for (cudf::size_type col_index = col_index_start; - col_index < num_columns; - col_index += col_index_stride) { - - cudf::size_type col_size = num_bytes[col_index]; - int8_t * col_tmp = &(row_tmp[output_offset_in_row[col_index]]); - const int8_t * col_input = input_data[col_index]; - switch(col_size) { - case 1: - { - *col_tmp = col_input[row_index]; - break; - } - case 2: - { - const int16_t * short_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = short_col_input[row_index]; - break; - } - case 4: - { - const int32_t * int_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = int_col_input[row_index]; - break; - } - case 8: - { - const int64_t * long_col_input = reinterpret_cast(col_input); - *reinterpret_cast(col_tmp) = long_col_input[row_index]; - break; - } - default: - { - cudf::size_type input_offset = col_size * row_index; - // TODO this should just not be supported for fixed width columns, but just in case... 
- for (cudf::size_type b = 0; b < col_size; b++) { - col_tmp[b] = col_input[b + input_offset]; - } - break; - } - } - // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned - // so we have to rewrite the addresses to make sure that it is 4 byte aligned - int8_t * valid_byte = &row_vld_tmp[col_index/8]; - cudf::size_type byte_bit_offset = col_index % 8; - uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; - int32_t * valid_int = reinterpret_cast(valid_byte - fixup_bytes); - cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); - // Now copy validity for the column - if (input_nm[col_index]) { - if (bit_is_set(input_nm[col_index], row_index)) { - atomicOr_block(valid_int, 1 << int_bit_offset); - } else { - atomicAnd_block(valid_int, ~(1 << int_bit_offset)); - } - } else { - // It is valid so just set the bit - atomicOr_block(valid_int, 1 << int_bit_offset); - } - } // end column loop - } // end row copy - // wait for the row_group to be totally copied into shared memory - __syncthreads(); - - // Step 2: Copy the data back out - // We know row_size is always aligned with and a multiple of int64_t; - int64_t * long_shared = reinterpret_cast(shared_data); - int64_t * long_output = reinterpret_cast(output_data); - - cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); - cudf::size_type shared_input_stride = blockDim.x * blockDim.y; - cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); - if (row_index_end > num_rows) { - row_index_end = num_rows; +__global__ void +copy_from_fixed_width_columns(const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type row_size, + const cudf::size_type *output_offset_in_row, + const cudf::size_type *num_bytes, const int8_t **input_data, + const cudf::bitmask_type **input_nm, int8_t *output_data) { + // We are going to copy the data in two passes. + // The first pass copies a chunk of data into shared memory. + // The second pass copies that chunk from shared memory out to the final location. + + // Because shared memory is limited we copy a subset of the rows at a time. + // We do not support copying a subset of the columns in a row yet, so we don't + // currently support a row that is wider than shared memory. + // For simplicity we will refer to this as a row_group + + // In practice we have found reading more than 4 columns of data per thread + // results in performance loss. As such we are using a 2 dimensional + // kernel in terms of threads, but not in terms of blocks. Columns are + // controlled by the y dimension (there is no y dimension in blocks). Rows + // are controlled by the x dimension (there are multiple blocks in the x + // dimension). 
+ + cudf::size_type rows_per_group = blockDim.x; + cudf::size_type row_group_start = blockIdx.x; + cudf::size_type row_group_stride = gridDim.x; + cudf::size_type row_group_end = (num_rows + rows_per_group - 1) / rows_per_group + 1; + + extern __shared__ int8_t shared_data[]; + + // Because we are copying fixed width only data and we stride the rows + // this thread will always start copying to shared data in the same place + int8_t *row_tmp = &shared_data[row_size * threadIdx.x]; + int8_t *row_vld_tmp = + &row_tmp[output_offset_in_row[num_columns - 1] + num_bytes[num_columns - 1]]; + + for (cudf::size_type row_group_index = row_group_start; row_group_index < row_group_end; + row_group_index += row_group_stride) { + + // Within the row group there should be 1 thread for each row. This is a + // requirement for launching the kernel + cudf::size_type row_index = start_row + (row_group_index * rows_per_group) + threadIdx.x; + // But we might not use all of the threads if the number of rows does not go + // evenly into the thread count. We don't want those threads to exit yet + // because we may need them to copy data back out. + if (row_index < (start_row + num_rows)) { + cudf::size_type col_index_start = threadIdx.y; + cudf::size_type col_index_stride = blockDim.y; + for (cudf::size_type col_index = col_index_start; col_index < num_columns; + col_index += col_index_stride) { + + cudf::size_type col_size = num_bytes[col_index]; + int8_t *col_tmp = &(row_tmp[output_offset_in_row[col_index]]); + const int8_t *col_input = input_data[col_index]; + switch (col_size) { + case 1: { + *col_tmp = col_input[row_index]; + break; + } + case 2: { + const int16_t *short_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = short_col_input[row_index]; + break; + } + case 4: { + const int32_t *int_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = int_col_input[row_index]; + break; + } + case 8: { + const int64_t *long_col_input = reinterpret_cast(col_input); + *reinterpret_cast(col_tmp) = long_col_input[row_index]; + break; + } + default: { + cudf::size_type input_offset = col_size * row_index; + // TODO this should just not be supported for fixed width columns, but just in case... 
+ for (cudf::size_type b = 0; b < col_size; b++) { + col_tmp[b] = col_input[b + input_offset]; + } + break; + } + } + // atomicOr only works on 32 bit or 64 bit aligned values, and not byte aligned + // so we have to rewrite the addresses to make sure that it is 4 byte aligned + int8_t *valid_byte = &row_vld_tmp[col_index / 8]; + cudf::size_type byte_bit_offset = col_index % 8; + uint64_t fixup_bytes = reinterpret_cast(valid_byte) % 4; + int32_t *valid_int = reinterpret_cast(valid_byte - fixup_bytes); + cudf::size_type int_bit_offset = byte_bit_offset + (fixup_bytes * 8); + // Now copy validity for the column + if (input_nm[col_index]) { + if (bit_is_set(input_nm[col_index], row_index)) { + atomicOr_block(valid_int, 1 << int_bit_offset); + } else { + atomicAnd_block(valid_int, ~(1 << int_bit_offset)); + } + } else { + // It is valid so just set the bit + atomicOr_block(valid_int, 1 << int_bit_offset); } - cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); - cudf::size_type shared_length = row_size * num_rows_in_group; + } // end column loop + } // end row copy + // wait for the row_group to be totally copied into shared memory + __syncthreads(); + + // Step 2: Copy the data back out + // We know row_size is always aligned with and a multiple of int64_t; + int64_t *long_shared = reinterpret_cast(shared_data); + int64_t *long_output = reinterpret_cast(output_data); + + cudf::size_type shared_input_index = threadIdx.x + (threadIdx.y * blockDim.x); + cudf::size_type shared_input_stride = blockDim.x * blockDim.y; + cudf::size_type row_index_end = ((row_group_index + 1) * rows_per_group); + if (row_index_end > num_rows) { + row_index_end = num_rows; + } + cudf::size_type num_rows_in_group = row_index_end - (row_group_index * rows_per_group); + cudf::size_type shared_length = row_size * num_rows_in_group; - cudf::size_type shared_input_end = shared_length/sizeof(int64_t); + cudf::size_type shared_input_end = shared_length / sizeof(int64_t); - cudf::size_type start_output_index = (row_size * row_group_index * rows_per_group)/sizeof(int64_t); + cudf::size_type start_output_index = + (row_size * row_group_index * rows_per_group) / sizeof(int64_t); - for (cudf::size_type shared_index = shared_input_index; - shared_index < shared_input_end; - shared_index += shared_input_stride) { - long_output[start_output_index + shared_index] = long_shared[shared_index]; - } - __syncthreads(); - // Go for the next round + for (cudf::size_type shared_index = shared_input_index; shared_index < shared_input_end; + shared_index += shared_input_stride) { + long_output[start_output_index + shared_index] = long_shared[shared_index]; } + __syncthreads(); + // Go for the next round + } } /** @@ -341,60 +313,58 @@ void copy_from_fixed_width_columns( * @param [out] threads the size of the threads for the kernel * @return the size in bytes of shared memory needed for each block. */ -static int calc_fixed_width_kernel_dims( - const cudf::size_type num_columns, - const cudf::size_type num_rows, - const cudf::size_type size_per_row, - dim3 & blocks, - dim3 & threads) { - - // We have found speed degrades when a thread handles more than 4 columns. - // Each block is 2 dimensional. The y dimension indicates the columns. - // We limit this to 32 threads in the y dimension so we can still - // have at least 32 threads in the x dimension (1 warp) which should - // result in better coalescing of memory operations. 
We also - // want to guarantee that we are processing a multiple of 32 threads - // in the x dimension because we use atomic operations at the block - // level when writing validity data out to main memory, and that would - // need to change if we split a word of validity data between blocks. - int y_block_size = (num_columns + 3) / 4; - if (y_block_size > 32) { - y_block_size = 32; - } - int x_possible_block_size = 1024/y_block_size; - // 48KB is the default setting for shared memory per block according to the cuda tutorials - // If someone configures the GPU to only have 16 KB this might not work. - int max_shared_size = 48 * 1024; - int max_block_size = max_shared_size/size_per_row; - // If we don't have enough shared memory there is no point in having more threads - // per block that will just sit idle - max_block_size = max_block_size > x_possible_block_size ? x_possible_block_size : max_block_size; - // Make sure that the x dimension is a multiple of 32 this not only helps - // coalesce memory access it also lets us do a ballot sync for validity to write - // the data back out the warp level. If x is a multiple of 32 then each thread in the y - // dimension is associated with one or more warps, that should correspond to the validity - // words directly. - int block_size = (max_block_size / 32) * 32; - CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); - - int num_blocks = (num_rows + block_size - 1) / block_size; - if (num_blocks < 1) { - num_blocks = 1; - } else if (num_blocks > 10240) { - // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 - // but in practice haveing too many can cause some overhead that I don't totally - // understand. Playing around with this haveing as little as 600 blocks appears - // to be able to saturate memory on V100, so this is an order of magnitude higher - // to try and future proof this a bit. - num_blocks = 10240; - } - blocks.x = num_blocks; - blocks.y = 1; - blocks.z = 1; - threads.x = block_size; - threads.y = y_block_size; - threads.z = 1; - return size_per_row * block_size; +static int calc_fixed_width_kernel_dims(const cudf::size_type num_columns, + const cudf::size_type num_rows, + const cudf::size_type size_per_row, dim3 &blocks, + dim3 &threads) { + + // We have found speed degrades when a thread handles more than 4 columns. + // Each block is 2 dimensional. The y dimension indicates the columns. + // We limit this to 32 threads in the y dimension so we can still + // have at least 32 threads in the x dimension (1 warp) which should + // result in better coalescing of memory operations. We also + // want to guarantee that we are processing a multiple of 32 threads + // in the x dimension because we use atomic operations at the block + // level when writing validity data out to main memory, and that would + // need to change if we split a word of validity data between blocks. + int y_block_size = (num_columns + 3) / 4; + if (y_block_size > 32) { + y_block_size = 32; + } + int x_possible_block_size = 1024 / y_block_size; + // 48KB is the default setting for shared memory per block according to the cuda tutorials + // If someone configures the GPU to only have 16 KB this might not work. + int max_shared_size = 48 * 1024; + int max_block_size = max_shared_size / size_per_row; + // If we don't have enough shared memory there is no point in having more threads + // per block that will just sit idle + max_block_size = max_block_size > x_possible_block_size ? 
x_possible_block_size : max_block_size; + // Make sure that the x dimension is a multiple of 32 this not only helps + // coalesce memory access it also lets us do a ballot sync for validity to write + // the data back out the warp level. If x is a multiple of 32 then each thread in the y + // dimension is associated with one or more warps, that should correspond to the validity + // words directly. + int block_size = (max_block_size / 32) * 32; + CUDF_EXPECTS(block_size != 0, "Row size is too large to fit in shared memory"); + + int num_blocks = (num_rows + block_size - 1) / block_size; + if (num_blocks < 1) { + num_blocks = 1; + } else if (num_blocks > 10240) { + // The maximum number of blocks supported in the x dimension is 2 ^ 31 - 1 + // but in practice haveing too many can cause some overhead that I don't totally + // understand. Playing around with this haveing as little as 600 blocks appears + // to be able to saturate memory on V100, so this is an order of magnitude higher + // to try and future proof this a bit. + num_blocks = 10240; + } + blocks.x = num_blocks; + blocks.y = 1; + blocks.z = 1; + threads.x = block_size; + threads.y = y_block_size; + threads.z = 1; + return size_per_row * block_size; } /** @@ -404,68 +374,53 @@ static int calc_fixed_width_kernel_dims( * into this function are common between runs and should be calculated once. */ static std::unique_ptr fixed_width_convert_to_rows( - const cudf::size_type start_row, - const cudf::size_type num_rows, - const cudf::size_type num_columns, - const cudf::size_type size_per_row, - std::unique_ptr> & column_start, - std::unique_ptr> & column_size, - std::unique_ptr> & input_data, - std::unique_ptr> & input_nm, - const cudf::scalar & zero, - const cudf::scalar & scalar_size_per_row, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr - ) { - int64_t total_allocation = size_per_row * num_rows; - // We made a mistake in the split somehow - CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); - - // Allocate and set the offsets row for the byte array - std::unique_ptr offsets = cudf::detail::sequence( - num_rows + 1, zero, scalar_size_per_row); - - std::unique_ptr data = - cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT8), - static_cast(total_allocation)); - - dim3 blocks; - dim3 threads; - int shared_size = calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - copy_from_fixed_width_columns<<>>( - start_row, - num_rows, - num_columns, - size_per_row, - column_start->data(), - column_size->data(), - input_data->data(), - input_nm->data(), - data->mutable_view().data()); - - return cudf::make_lists_column(num_rows, - std::move(offsets), - std::move(data), - 0, - rmm::device_buffer{0, 0, mr}); + const cudf::size_type start_row, const cudf::size_type num_rows, + const cudf::size_type num_columns, const cudf::size_type size_per_row, + std::unique_ptr> &column_start, + std::unique_ptr> &column_size, + std::unique_ptr> &input_data, + std::unique_ptr> &input_nm, + const cudf::scalar &zero, const cudf::scalar &scalar_size_per_row, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + int64_t total_allocation = size_per_row * num_rows; + // We made a mistake in the split somehow + CUDF_EXPECTS(total_allocation < std::numeric_limits::max(), "Table is too large to fit!"); + + // Allocate and set the offsets row for the byte array + std::unique_ptr offsets = + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, 
stream); + + std::unique_ptr data = cudf::make_numeric_column( + cudf::data_type(cudf::type_id::INT8), static_cast(total_allocation), + cudf::mask_state::UNALLOCATED, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + copy_from_fixed_width_columns<<>>( + start_row, num_rows, num_columns, size_per_row, column_start->data(), column_size->data(), + input_data->data(), input_nm->data(), data->mutable_view().data()); + + return cudf::make_lists_column(num_rows, std::move(offsets), std::move(data), 0, + rmm::device_buffer{0, 0, mr}, stream, mr); } -static cudf::data_type get_data_type(const cudf::column_view & v) { - return v.type(); +static cudf::data_type get_data_type(const cudf::column_view &v) { + return v.type(); } -static bool is_fixed_width(const cudf::data_type & t) { - return cudf::is_fixed_width(t); +static bool is_fixed_width(const cudf::data_type &t) { + return cudf::is_fixed_width(t); } static inline int32_t align_offset(int32_t offset, std::size_t alignment) { - return (offset + alignment - 1) & ~(alignment - 1); + return (offset + alignment - 1) & ~(alignment - 1); } -static inline bool are_all_fixed_width(std::vector const & schema) { - return std::all_of(schema.begin(), schema.end(), cudf::java::is_fixed_width); +static inline bool are_all_fixed_width(std::vector const &schema) { + return std::all_of(schema.begin(), schema.end(), cudf::java::is_fixed_width); } /** @@ -475,168 +430,149 @@ static inline bool are_all_fixed_width(std::vector const & sche * @param [out] column_size the size in bytes of the data for each columns in the row. * @return the size in bytes each row needs. */ -static inline int32_t compute_fixed_width_layout( - std::vector const & schema, - std::vector & column_start, - std::vector & column_size) { - // We guarantee that the start of each column is 64-bit aligned so anything can go - // there, but to make the code simple we will still do an alignment for it. - int32_t at_offset = 0; - for (auto col = schema.begin(); col < schema.end(); col++) { - cudf::size_type s = cudf::size_of(*col); - column_size.emplace_back(s); - std::size_t allocation_needed = s; - std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types - at_offset = align_offset(at_offset, alignment_needed); - column_start.emplace_back(at_offset); - at_offset += allocation_needed; - } - - // Now we need to add in space for validity - // Eventually we can think about nullable vs not nullable, but for now we will just always add it in - int32_t validity_bytes_needed = (schema.size() + 7)/8; - // validity comes at the end and is byte aligned so we can pack more in. - at_offset += validity_bytes_needed; - // Now we need to pad the end so all rows are 64 bit aligned - return align_offset(at_offset, 8); // 8 bytes (64 bits) +static inline int32_t compute_fixed_width_layout(std::vector const &schema, + std::vector &column_start, + std::vector &column_size) { + // We guarantee that the start of each column is 64-bit aligned so anything can go + // there, but to make the code simple we will still do an alignment for it. 
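For intuition, here is a small standalone walk-through of the same layout rules (a hypothetical INT8/INT32/INT64 schema, not part of the patch): each column start is aligned to its own size, one validity bit per column is packed into bytes at the end, and the row is padded to 8 bytes.

#include <cstdio>
#include <vector>

// Same rounding rule as align_offset above.
static int align_to(int offset, int alignment) {
  return (offset + alignment - 1) & ~(alignment - 1);
}

int main() {
  // Hypothetical fixed-width schema: INT8, INT32, INT64.
  std::vector<int> sizes{1, 4, 8};
  int offset = 0;
  for (int s : sizes) {
    offset = align_to(offset, s);  // each column starts on its own alignment
    std::printf("column of %d bytes starts at offset %d\n", s, offset);
    offset += s;
  }
  offset += (static_cast<int>(sizes.size()) + 7) / 8;  // one validity bit per column, byte packed
  std::printf("row size (padded to 8 bytes) = %d\n", align_to(offset, 8));
  return 0;
}

With this three-column schema the packed row comes out at 24 bytes: 16 bytes of data, 1 validity byte, and 7 bytes of tail padding so that consecutive rows stay 64-bit aligned.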
+ int32_t at_offset = 0; + for (auto col = schema.begin(); col < schema.end(); col++) { + cudf::size_type s = cudf::size_of(*col); + column_size.emplace_back(s); + std::size_t allocation_needed = s; + std::size_t alignment_needed = allocation_needed; // They are the same for fixed width types + at_offset = align_offset(at_offset, alignment_needed); + column_start.emplace_back(at_offset); + at_offset += allocation_needed; + } + + // Now we need to add in space for validity + // Eventually we can think about nullable vs not nullable, but for now we will just always add it + // in + int32_t validity_bytes_needed = (schema.size() + 7) / 8; + // validity comes at the end and is byte aligned so we can pack more in. + at_offset += validity_bytes_needed; + // Now we need to pad the end so all rows are 64 bit aligned + return align_offset(at_offset, 8); // 8 bytes (64 bits) } -std::vector> convert_to_rows( - cudf::table_view const& tbl, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr) { +std::vector> convert_to_rows(cudf::table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { - const cudf::size_type num_columns = tbl.num_columns(); + const cudf::size_type num_columns = tbl.num_columns(); - std::vector schema; - schema.resize(num_columns); - std::transform(tbl.begin(), tbl.end(), schema.begin(), cudf::java::get_data_type); + std::vector schema; + schema.resize(num_columns); + std::transform(tbl.begin(), tbl.end(), schema.begin(), cudf::java::get_data_type); - if (are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; + if (are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; - int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); - auto dev_column_start = copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = copy_to_dev_async(column_size, stream, mr); + int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); + auto dev_column_start = copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = copy_to_dev_async(column_size, stream, mr); - int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; - // Make the number of rows per batch a multiple of 32 so we don't have to worry about - // splitting validity at a specific row offset. This might change in the future. - max_rows_per_batch = (max_rows_per_batch/32) * 32; + int32_t max_rows_per_batch = std::numeric_limits::max() / size_per_row; + // Make the number of rows per batch a multiple of 32 so we don't have to worry about + // splitting validity at a specific row offset. This might change in the future. 
+ max_rows_per_batch = (max_rows_per_batch / 32) * 32; - cudf::size_type num_rows = tbl.num_rows(); + cudf::size_type num_rows = tbl.num_rows(); - // Get the pointers to the input columnar data ready - std::vector input_data; - std::vector input_nm; - for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { - cudf::column_view cv = tbl.column(column_number); - input_data.emplace_back(cv.data()); - input_nm.emplace_back(cv.null_mask()); - } - auto dev_input_data = copy_to_dev_async(input_data, stream, mr); - auto dev_input_nm = copy_to_dev_async(input_nm, stream, mr); - - using ScalarType = cudf::scalar_type_t; - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); - zero->set_valid(true); - static_cast(zero.get())->set_value(0); - - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); - step->set_valid(true); - static_cast(step.get())->set_value(static_cast(size_per_row)); - - std::vector> ret; - for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { - cudf::size_type row_count = num_rows - row_start; - row_count = row_count > max_rows_per_batch ? max_rows_per_batch : row_count; - ret.emplace_back(fixed_width_convert_to_rows( - row_start, - row_count, - num_columns, - size_per_row, - dev_column_start, - dev_column_size, - dev_input_data, - dev_input_nm, - *zero, - *step, - stream, - mr)); - } - - return ret; - } else { - CUDF_FAIL("Only fixed width types are currently supported"); + // Get the pointers to the input columnar data ready + std::vector input_data; + std::vector input_nm; + for (cudf::size_type column_number = 0; column_number < num_columns; column_number++) { + cudf::column_view cv = tbl.column(column_number); + input_data.emplace_back(cv.data()); + input_nm.emplace_back(cv.null_mask()); + } + auto dev_input_data = copy_to_dev_async(input_data, stream, mr); + auto dev_input_nm = copy_to_dev_async(input_nm, stream, mr); + + using ScalarType = cudf::scalar_type_t; + auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + zero->set_valid(true, stream); + static_cast(zero.get())->set_value(0, stream); + + auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32), stream.value()); + step->set_valid(true, stream); + static_cast(step.get()) + ->set_value(static_cast(size_per_row), stream); + + std::vector> ret; + for (cudf::size_type row_start = 0; row_start < num_rows; row_start += max_rows_per_batch) { + cudf::size_type row_count = num_rows - row_start; + row_count = row_count > max_rows_per_batch ? 
max_rows_per_batch : row_count; + ret.emplace_back(fixed_width_convert_to_rows( + row_start, row_count, num_columns, size_per_row, dev_column_start, dev_column_size, + dev_input_data, dev_input_nm, *zero, *step, stream, mr)); } -} -std::unique_ptr convert_from_rows( - cudf::lists_column_view const& input, - std::vector const& schema, - cudaStream_t stream, - rmm::mr::device_memory_resource* mr) { - - // verify that the types are what we expect - cudf::column_view child = input.child(); - cudf::type_id list_type = child.type().id(); - CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, - "Only a list of bytes is supported as input"); - - cudf::size_type num_columns = schema.size(); - - if (are_all_fixed_width(schema)) { - std::vector column_start; - std::vector column_size; - - cudf::size_type num_rows = input.parent().size(); - int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); - - // Ideally we would check that the offsets are all the same, etc. but for now - // this is probably fine - CUDF_EXPECTS(size_per_row * num_rows == child.size(), - "The layout of the data appears to be off"); - auto dev_column_start = copy_to_dev_async(column_start, stream, mr); - auto dev_column_size = copy_to_dev_async(column_size, stream, mr); - - // Allocate the columns we are going to write into - std::vector> output_columns; - std::vector output_data; - std::vector output_nm; - for (cudf::size_type i = 0; i < num_columns; i++) { - auto column = cudf::make_fixed_width_column(schema[i], - num_rows, - cudf::mask_state::UNINITIALIZED, - stream, - mr); - auto mut = column->mutable_view(); - output_data.emplace_back(mut.data()); - output_nm.emplace_back(mut.null_mask()); - output_columns.emplace_back(std::move(column)); - } + return ret; + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } +} - auto dev_output_data = copy_to_dev_async(output_data, stream, mr); - auto dev_output_nm = copy_to_dev_async(output_nm, stream, mr); - - dim3 blocks; - dim3 threads; - int shared_size = calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); - - copy_to_fixed_width_columns<<>>( - num_rows, - num_columns, - size_per_row, - dev_column_start->data(), - dev_column_size->data(), - dev_output_data->data(), - dev_output_nm->data(), - child.data()); - - return std::make_unique(std::move(output_columns)); - } else { - CUDF_FAIL("Only fixed width types are currently supported"); +std::unique_ptr convert_from_rows(cudf::lists_column_view const &input, + std::vector const &schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + + // verify that the types are what we expect + cudf::column_view child = input.child(); + cudf::type_id list_type = child.type().id(); + CUDF_EXPECTS(list_type == cudf::type_id::INT8 || list_type == cudf::type_id::UINT8, + "Only a list of bytes is supported as input"); + + cudf::size_type num_columns = schema.size(); + + if (are_all_fixed_width(schema)) { + std::vector column_start; + std::vector column_size; + + cudf::size_type num_rows = input.parent().size(); + int32_t size_per_row = compute_fixed_width_layout(schema, column_start, column_size); + + // Ideally we would check that the offsets are all the same, etc. 
but for now + // this is probably fine + CUDF_EXPECTS(size_per_row * num_rows == child.size(), + "The layout of the data appears to be off"); + auto dev_column_start = copy_to_dev_async(column_start, stream, mr); + auto dev_column_size = copy_to_dev_async(column_size, stream, mr); + + // Allocate the columns we are going to write into + std::vector> output_columns; + std::vector output_data; + std::vector output_nm; + for (cudf::size_type i = 0; i < num_columns; i++) { + auto column = cudf::make_fixed_width_column(schema[i], num_rows, + cudf::mask_state::UNINITIALIZED, stream, mr); + auto mut = column->mutable_view(); + output_data.emplace_back(mut.data()); + output_nm.emplace_back(mut.null_mask()); + output_columns.emplace_back(std::move(column)); } + + auto dev_output_data = copy_to_dev_async(output_data, stream, mr); + auto dev_output_nm = copy_to_dev_async(output_nm, stream, mr); + + dim3 blocks; + dim3 threads; + int shared_size = + calc_fixed_width_kernel_dims(num_columns, num_rows, size_per_row, blocks, threads); + + copy_to_fixed_width_columns<<>>( + num_rows, num_columns, size_per_row, dev_column_start->data(), dev_column_size->data(), + dev_output_data->data(), dev_output_nm->data(), child.data()); + + return std::make_unique(std::move(output_columns)); + } else { + CUDF_FAIL("Only fixed width types are currently supported"); + } } } // namespace java diff --git a/java/src/main/native/src/row_conversion.hpp b/java/src/main/native/src/row_conversion.hpp index e26eadb35ea..17abde8df19 100644 --- a/java/src/main/native/src/row_conversion.hpp +++ b/java/src/main/native/src/row_conversion.hpp @@ -20,22 +20,21 @@ #include #include +#include namespace cudf { namespace java { +std::vector> +convert_to_rows(cudf::table_view const &tbl, + // TODO need something for validity + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -std::vector> convert_to_rows( - cudf::table_view const& tbl, - // TODO need something for validity - cudaStream_t stream = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr +convert_from_rows(cudf::lists_column_view const &input, std::vector const &schema, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); -std::unique_ptr convert_from_rows( - cudf::lists_column_view const& input, - std::vector const& schema, - cudaStream_t stream = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // java -} // cudf +} // namespace java +} // namespace cudf From 4c93c6212c8172a65ca58b47a477397d188ebe8d Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 11 Nov 2020 12:23:55 +1100 Subject: [PATCH 42/51] Add missing CONDA_INCLUDE_DIRS from benchmarks cmake --- cpp/benchmarks/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 667498fa965..592f1377c87 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -58,6 +58,10 @@ include_directories("${CMAKE_BINARY_DIR}/include" "${RMM_INCLUDE}" "${CMAKE_CURRENT_SOURCE_DIR}") +if(CONDA_INCLUDE_DIRS) + include_directories("${CONDA_INCLUDE_DIRS}") +endif(CONDA_INCLUDE_DIRS) + ################################################################################################### # - library paths --------------------------------------------------------------------------------- 
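The row_conversion.hpp change above follows the pattern used throughout this series: public entry points default the stream to rmm::cuda_stream_default, detail-level functions take an explicit rmm::cuda_stream_view, and raw CUDA launch parameters receive stream.value(). A condensed sketch of that pattern (the namespace, function, and kernel names here are illustrative only, not from the patch):

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

namespace example {  // illustrative namespace, not part of cudf

__global__ void fill_kernel(int *data, int n, int value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

namespace detail {
// Detail layer: the stream is explicit and has no default.
void fill(int *data, int n, int value, rmm::cuda_stream_view stream) {
  int threads = 256;
  int blocks  = (n + threads - 1) / threads;
  // Raw launch parameters still need the underlying cudaStream_t.
  fill_kernel<<<blocks, threads, 0, stream.value()>>>(data, n, value);
}
}  // namespace detail

// Public layer: stream and mr get defaults and forward to detail.
void fill(int *data, int n, int value,
          rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
          rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) {
  (void)mr;  // unused in this tiny sketch
  detail::fill(data, n, value, stream);
}

}  // namespace example

Note that the stream parameter precedes the memory resource, matching the stream,mr ordering adopted in the later commits of this series.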
From 87548f5957cb2d42f3ffa8e12ddfb3188ec1ee75 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 11 Nov 2020 12:24:24 +1100 Subject: [PATCH 43/51] Add CUDF_CPP_BUILD_DIR to enable rapids-compose build. --- java/pom.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/java/pom.xml b/java/pom.xml index d14d4202e21..8894d9eae46 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -351,6 +351,7 @@ + Date: Wed, 11 Nov 2020 12:25:34 +1100 Subject: [PATCH 44/51] Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index edd54204842..67a31c55fa7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -101,6 +101,7 @@ - PR #6708 Apply `na_rep` to column names in csv writer - PR #6721 Add missing serialization methods for ListColumn - PR #6722 Fix index=False bug in dask_cudf.read_parquet +- PR #6732 Fix cuDF benchmarks build with static Arrow lib and fix rapids-compose cuDF JNI build # cuDF 0.16.0 (21 Oct 2020) From 5b0592b4aaa632d555ee5c8a44bfc4c667b80235 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Mon, 16 Nov 2020 11:22:56 +1100 Subject: [PATCH 45/51] Fix includes, copyright and doc formatting. --- cpp/include/cudf/detail/gather.cuh | 2 +- cpp/include/cudf/detail/groupby.hpp | 5 +++-- cpp/include/cudf/detail/quantiles.hpp | 19 ++++++++----------- cpp/include/cudf/detail/reduction.cuh | 11 +++++++---- cpp/include/cudf/lists/detail/gather.cuh | 5 +++-- 5 files changed, 22 insertions(+), 20 deletions(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 9a115772a0c..8e586b231bc 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -35,9 +35,9 @@ #include #include +#include #include -#include "rmm/cuda_stream_view.hpp" #include #include diff --git a/cpp/include/cudf/detail/groupby.hpp b/cpp/include/cudf/detail/groupby.hpp index c616a2c8d50..ce5fdb92bd1 100644 --- a/cpp/include/cudf/detail/groupby.hpp +++ b/cpp/include/cudf/detail/groupby.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,9 +17,10 @@ #include #include +#include + #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace groupby { diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp index e93886c4f11..5fb2ce4cbe6 100644 --- a/cpp/include/cudf/detail/quantiles.hpp +++ b/cpp/include/cudf/detail/quantiles.hpp @@ -22,11 +22,10 @@ namespace cudf { namespace detail { -/** @copydoc cudf::quantile(column_view const&, std::vector const&, interpolation, - column_view const&, bool, rmm::mr::device_memory_resource*) - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ +/** @copydoc cudf::quantile() + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ std::unique_ptr quantile( column_view const& input, std::vector const& q, @@ -36,12 +35,10 @@ std::unique_ptr quantile( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** @copydoc cudf::quantiles(table_view const&, std::vector const&, interpolation, - cudf::sorted, std::vector const&, std::vector const&, - rmm::mr::device_memory_resource*) - * - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ +/** @copydoc cudf::quantiles() + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ std::unique_ptr
quantiles( table_view const& input, std::vector const& q, diff --git a/cpp/include/cudf/detail/reduction.cuh b/cpp/include/cudf/detail/reduction.cuh index 063114adbc3..cc899f946c5 100644 --- a/cpp/include/cudf/detail/reduction.cuh +++ b/cpp/include/cudf/detail/reduction.cuh @@ -16,16 +16,19 @@ #pragma once +#include "reduction_operators.cuh" + #include + +#include +#include #include #include -#include +#include + #include #include -#include -#include "reduction_operators.cuh" -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace reduction { diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 0427e04647d..439bd7ab089 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -15,13 +15,14 @@ */ #pragma once -#include #include #include #include +#include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace lists { From 9d88e344cb85eb96a8db94a298f145cbb488f2d5 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Tue, 17 Nov 2020 15:37:02 +1100 Subject: [PATCH 46/51] Update stream,mr order after recent merges --- .../cudf/structs/detail/concatenate.hpp | 2 +- cpp/src/filling/fill.cu | 2 +- cpp/src/round/round.cu | 2 +- cpp/src/structs/copying/concatenate.cu | 14 ++++++------ cpp/src/unary/cast_ops.cu | 22 +++++++++---------- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/cpp/include/cudf/structs/detail/concatenate.hpp b/cpp/include/cudf/structs/detail/concatenate.hpp index ebaf8ec5b3c..ef3da82cfeb 100644 --- a/cpp/include/cudf/structs/detail/concatenate.hpp +++ b/cpp/include/cudf/structs/detail/concatenate.hpp @@ -49,7 +49,7 @@ namespace detail { */ std::unique_ptr concatenate( std::vector const& columns, - cudaStream_t stream = 0, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index 390e590736f..8bf510dded6 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -64,7 +64,7 @@ struct in_place_fill_range_dispatch { template std::enable_if_t() && not cudf::is_fixed_point(), void> operator()( - cudf::size_type begin, cudf::size_type end, cudaStream_t stream = 0) + cudf::size_type begin, cudf::size_type end, rmm::cuda_stream_view stream) { in_place_fill(destination, begin, end, value, stream); } diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 0ec57013c30..8e8626db599 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -246,7 +246,7 @@ std::unique_ptr round_with(column_view const& input, auto const diff = input.type().scale() - (-decimal_places); auto const scalar = cudf::make_fixed_point_scalar(std::pow(10, diff), scale_type{-diff}); return cudf::detail::binary_operation( - input, *scalar, cudf::binary_operator::MUL, {}, mr, stream); + input, *scalar, cudf::binary_operator::MUL, {}, stream, mr); } auto const result_type = data_type{input.type().id(), scale_type{-decimal_places}}; diff --git a/cpp/src/structs/copying/concatenate.cu b/cpp/src/structs/copying/concatenate.cu index 6b917227302..47b63d9cf6f 100644 --- a/cpp/src/structs/copying/concatenate.cu +++ b/cpp/src/structs/copying/concatenate.cu @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include #include @@ -23,9 +22,11 @@ #include #include #include - #include +#include + +#include #include namespace cudf { @@ -36,10 +37,9 @@ namespace detail { * @copydoc cudf::structs::detail::concatenate * */ -std::unique_ptr concatenate( - std::vector const& columns, - cudaStream_t stream = 0, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr concatenate(std::vector const& columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // get ordered children auto ordered_children = extract_ordered_struct_children(columns); @@ -51,7 +51,7 @@ std::unique_ptr concatenate( ordered_children.end(), std::back_inserter(children), [mr, stream](std::vector const& cols) { - return cudf::detail::concatenate(cols, mr, stream); + return cudf::detail::concatenate(cols, stream, mr); }); size_type const total_length = children[0]->size(); diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 108c0794af9..f30ec1e2b83 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -172,18 +172,18 @@ struct device_cast { template ()>* = nullptr> std::unique_ptr rescale(column_view input, numeric::scale_type scale, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace numeric; if (input.type().scale() > scale) { auto const scalar = make_fixed_point_scalar(0, scale_type{scale}); - return detail::binary_operation(input, *scalar, binary_operator::ADD, {}, mr, stream); + return detail::binary_operation(input, *scalar, binary_operator::ADD, {}, stream, mr); } else { auto const diff = input.type().scale() - scale; auto const scalar = make_fixed_point_scalar(std::pow(10, -diff), scale_type{diff}); - return detail::binary_operation(input, *scalar, binary_operator::DIV, {}, mr, stream); + return detail::binary_operation(input, *scalar, binary_operator::DIV, {}, stream, mr); } }; @@ -286,12 +286,12 @@ struct dispatch_unary_cast_to { typename std::enable_if_t() && cudf::is_fixed_point() && std::is_same::value>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.type() == type) return std::make_unique(input); // TODO add test for this - return detail::rescale(input, numeric::scale_type{type.scale()}, mr, stream); + return detail::rescale(input, numeric::scale_type{type.scale()}, stream, mr); } template < @@ -300,8 +300,8 @@ struct dispatch_unary_cast_to { typename std::enable_if_t() && cudf::is_fixed_point() && not std::is_same::value>* = nullptr> std::unique_ptr operator()(data_type type, - rmm::mr::device_memory_resource* mr, - cudaStream_t stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace numeric; @@ -318,14 +318,14 @@ struct dispatch_unary_cast_to { mutable_column_view output_mutable = *temporary; - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), output_mutable.begin(), device_cast{}); // clearly there is a more efficient way to do this, can optimize in the future - return rescale(*temporary, numeric::scale_type{type.scale()}, mr, stream); + return rescale(*temporary, numeric::scale_type{type.scale()}, stream, mr); } template Date: Wed, 18 Nov 2020 13:19:32 +1100 Subject: [PATCH 47/51] Remove MR parameter when it 
can be defaulted. --- cpp/include/cudf/detail/null_mask.hpp | 31 +++++++++++++----------- cpp/src/copying/scatter.cu | 20 +++++++-------- cpp/src/dictionary/add_keys.cu | 4 +-- cpp/src/dictionary/detail/concatenate.cu | 3 +-- cpp/src/dictionary/set_keys.cu | 2 +- cpp/src/filling/fill.cu | 6 ++--- cpp/src/groupby/hash/groupby.cu | 3 +-- cpp/src/groupby/sort/sort_helper.cu | 3 +-- cpp/src/interop/to_arrow.cpp | 2 +- cpp/src/replace/clamp.cu | 18 ++++---------- cpp/src/replace/replace.cu | 3 +-- cpp/src/search/search.cu | 3 +-- cpp/src/transform/encode.cu | 3 +-- 13 files changed, 43 insertions(+), 58 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 50a2424e86c..9a5e000f265 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -93,20 +93,22 @@ rmm::device_buffer copy_bitmask( * * @param stream CUDA stream used for device memory operations and kernel launches */ -rmm::device_buffer bitmask_and(std::vector const &masks, - std::vector const &begin_bits, - size_type mask_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); +rmm::device_buffer bitmask_and( + std::vector const &masks, + std::vector const &begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); /** * @copydoc cudf::bitmask_and * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -rmm::device_buffer bitmask_and(table_view const &view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); +rmm::device_buffer bitmask_and( + table_view const &view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); /** * @brief Performs a bitwise AND of the specified bitmasks, @@ -120,12 +122,13 @@ rmm::device_buffer bitmask_and(table_view const &view, * @param mr Device memory resource used to allocate the returned device_buffer * @return rmm::device_buffer Output bitmask */ -void inplace_bitmask_and(bitmask_type *dest_mask, - std::vector const &masks, - std::vector const &begin_bits, - size_type mask_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr); +void inplace_bitmask_and( + bitmask_type *dest_mask, + std::vector const &masks, + std::vector const &begin_bits, + size_type mask_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 036962ab744..d8beb052f8f 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -177,16 +177,14 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - auto dict_target = dictionary::detail::add_keys( - dictionary_column_view(target), - make_column_from_scalar(source.get(), 1, stream, rmm::mr::get_current_device_resource()) - ->view(), - mr, - stream.value()); + auto dict_target = + dictionary::detail::add_keys(dictionary_column_view(target), + make_column_from_scalar(source.get(), 1, stream)->view(), + mr, + stream.value()); auto dict_view = dictionary_column_view(dict_target->view()); - auto scalar_index = dictionary::detail::get_index( - dict_view, source.get(), stream, rmm::mr::get_current_device_resource()); - auto scalar_iter = thrust::make_permutation_iterator( + auto scalar_index = 
dictionary::detail::get_index(dict_view, source.get(), stream); + auto scalar_iter = thrust::make_permutation_iterator( indexalator_factory::make_input_iterator(*scalar_index), thrust::make_constant_iterator(0)); auto new_indices = std::make_unique(dict_view.get_indices_annotated(), stream, mr); auto target_iter = indexalator_factory::make_output_iterator(new_indices->mutable_view()); @@ -336,8 +334,8 @@ std::unique_ptr boolean_mask_scatter(column_view const& input, 0); // The scatter map is actually a table with only one column, which is scatter map. - auto scatter_map = detail::apply_boolean_mask( - table_view{{indices->view()}}, boolean_mask, stream, rmm::mr::get_current_device_resource()); + auto scatter_map = + detail::apply_boolean_mask(table_view{{indices->view()}}, boolean_mask, stream); auto output_table = detail::scatter(table_view{{input}}, scatter_map->get_column(0).view(), table_view{{target}}, diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 6a9b294758d..79effe3fc97 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -57,8 +57,8 @@ std::unique_ptr add_keys( CUDF_EXPECTS(new_keys.type() == old_keys.type(), "Keys must be the same type"); // first, concatenate the keys together // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e] - auto combined_keys = cudf::detail::concatenate( - std::vector{old_keys, new_keys}, stream, rmm::mr::get_current_device_resource()); + auto combined_keys = + cudf::detail::concatenate(std::vector{old_keys, new_keys}, stream); // sort and remove any duplicates from the combined keys // drop_duplicates([a,b,c,d,f,d,b,e]) = [a,b,c,d,e,f] auto table_keys = cudf::detail::drop_duplicates(table_view{{combined_keys->view()}}, diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index 223e2d7c331..3d44085232e 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -204,8 +204,7 @@ std::unique_ptr concatenate(std::vector const& columns, CUDF_EXPECTS(keys.type() == keys_type, "key types of all dictionary columns must match"); return keys; }); - auto all_keys = - cudf::detail::concatenate(keys_views, stream, rmm::mr::get_current_device_resource()); + auto all_keys = cudf::detail::concatenate(keys_views, stream); // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index c934e495de3..69fdcd85b35 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -156,7 +156,7 @@ std::vector> match_dictionaries(std::vector keys(input.size()); std::transform(input.begin(), input.end(), keys.begin(), [](auto& col) { return col.keys(); }); - auto new_keys = cudf::detail::concatenate(keys, stream, rmm::mr::get_current_device_resource()); + auto new_keys = cudf::detail::concatenate(keys, stream); auto keys_view = new_keys->view(); std::vector> result(input.size()); std::transform(input.begin(), input.end(), result.begin(), [keys_view, mr, stream](auto& col) { diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index 8bf510dded6..77482e13b6c 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -174,16 +174,14 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()view(), mr, stream.value()); cudf::column_view const target_indices = cudf::dictionary_column_view(target_matched->view()).get_indices_annotated(); // get the index of the key just added - 
auto index_of_value = cudf::dictionary::detail::get_index( - target_matched->view(), value, stream, rmm::mr::get_current_device_resource()); + auto index_of_value = cudf::dictionary::detail::get_index(target_matched->view(), value, stream); // now call fill using just the indices column and the new index auto new_indices = cudf::type_dispatcher(target_indices.type(), diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index e0c9d92fd30..0a56563cf87 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -280,8 +280,7 @@ void compute_single_pass_aggs(table_view const& keys, bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; if (skip_key_rows_with_nulls) { - auto row_bitmask{ - cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource())}; + auto row_bitmask{cudf::detail::bitmask_and(keys, stream)}; thrust::for_each_n( rmm::exec_policy(stream)->on(stream.value()), thrust::make_counting_iterator(0), diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 4b4c6a96688..595efc8198d 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -243,8 +243,7 @@ column_view sort_groupby_helper::keys_bitmask_column(rmm::cuda_stream_view strea { if (_keys_bitmask_column) return _keys_bitmask_column->view(); - auto row_bitmask = - cudf::detail::bitmask_and(_keys, stream, rmm::mr::get_current_device_resource()); + auto row_bitmask = cudf::detail::bitmask_and(_keys, stream); _keys_bitmask_column = make_numeric_column(data_type(type_id::INT8), _keys.num_rows(), diff --git a/cpp/src/interop/to_arrow.cpp b/cpp/src/interop/to_arrow.cpp index c36b2be77e8..874d9078444 100644 --- a/cpp/src/interop/to_arrow.cpp +++ b/cpp/src/interop/to_arrow.cpp @@ -137,7 +137,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - auto bitmask = bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); + auto bitmask = bools_to_mask(input, stream); auto result = arrow::AllocateBuffer(static_cast(bitmask.first->size()), ar_mr); CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index cdd8d78fdef..c69cf8c16f4 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -322,11 +322,7 @@ std::unique_ptr dispatch_clamp::operator()( auto add_scalar_key = [&](scalar const& key, scalar const& key_replace) { if (key.is_valid()) { result = dictionary::detail::add_keys( - matched_view, - make_column_from_scalar(key_replace, 1, stream, rmm::mr::get_current_device_resource()) - ->view(), - mr, - stream); + matched_view, make_column_from_scalar(key_replace, 1, stream)->view(), mr, stream); matched_view = dictionary_column_view(result->view()); } }; @@ -337,16 +333,12 @@ std::unique_ptr dispatch_clamp::operator()( auto matched_view = dictionary_column_view(matched_column->view()); // get the indexes for lo_replace and for hi_replace - auto lo_replace_index = dictionary::detail::get_index( - matched_view, lo_replace, stream, rmm::mr::get_current_device_resource()); - auto hi_replace_index = dictionary::detail::get_index( - matched_view, hi_replace, stream, rmm::mr::get_current_device_resource()); + auto lo_replace_index = dictionary::detail::get_index(matched_view, lo_replace, stream); + auto hi_replace_index = dictionary::detail::get_index(matched_view, hi_replace, stream); // get 
the closest indexes for lo and for hi - auto lo_index = dictionary::detail::get_insert_index( - matched_view, lo, stream, rmm::mr::get_current_device_resource()); - auto hi_index = dictionary::detail::get_insert_index( - matched_view, hi, stream, rmm::mr::get_current_device_resource()); + auto lo_index = dictionary::detail::get_insert_index(matched_view, lo, stream); + auto hi_index = dictionary::detail::get_insert_index(matched_view, hi, stream); // call clamp with the scalar indexes and the matched indices auto matched_indices = matched_view.get_indices_annotated(); diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 6ca894ac186..a6b129630a8 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -453,8 +453,7 @@ std::unique_ptr replace_kernel_forwarder::operator()view(), mr, stream.value()); }(); auto matched_view = cudf::dictionary_column_view(matched_input->view()); diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index e8d776d0d2a..0efd68ac974 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -197,8 +197,7 @@ bool contains_scalar_dispatch::operator()(column_view const& { auto dict_col = cudf::dictionary_column_view(col); // first, find the value in the dictionary's key set - auto index = cudf::dictionary::detail::get_index( - dict_col, value, stream, rmm::mr::get_current_device_resource()); + auto index = cudf::dictionary::detail::get_index(dict_col, value, stream); // if found, check the index is actually in the indices column return index->is_valid() ? cudf::type_dispatcher(dict_col.indices().type(), contains_scalar_dispatch{}, diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 1ecf8a7814a..5abe4e1aaf7 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -53,8 +53,7 @@ std::pair, std::unique_ptr> encode( // https://github.com/rapidsai/cudf/issues/6144 is resolved auto num_rows = keys_table->num_rows(); - auto mask = - cudf::detail::bitmask_and(keys_table->view(), stream, rmm::mr::get_current_device_resource()); + auto mask = cudf::detail::bitmask_and(keys_table->view(), stream); auto num_rows_with_nulls = cudf::count_unset_bits(reinterpret_cast(mask.data()), 0, num_rows); From 6a6274487b2e97ee5066443b9df2620a093942f7 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 18 Nov 2020 13:23:54 +1100 Subject: [PATCH 48/51] Header fix --- cpp/src/groupby/groupby.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 4c391852386..6a004393b83 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -29,11 +29,12 @@ #include #include +#include + #include #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace groupby { From acbb2ebca732f74a7683feb8b7c16d48a1f2e495 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Wed, 18 Nov 2020 13:29:04 +1100 Subject: [PATCH 49/51] Header fixes --- cpp/include/cudf/strings/detail/copy_range.cuh | 3 ++- cpp/include/cudf/strings/detail/gather.cuh | 3 ++- cpp/include/cudf/table/table_device_view.cuh | 3 ++- cpp/src/copying/concatenate.cu | 3 ++- cpp/src/copying/copy_range.cu | 3 ++- cpp/src/dictionary/replace.cu | 3 ++- cpp/src/interop/dlpack.cpp | 3 ++- cpp/src/join/semi_join.cu | 7 ++++--- cpp/src/replace/nulls.cu | 3 ++- cpp/src/reshape/tile.cu | 7 ++++--- cpp/src/sort/sort.cu | 3 ++- cpp/src/stream_compaction/drop_nulls.cu | 5 +++-- cpp/src/transform/transform.cpp | 3 ++- cpp/src/unary/nan_ops.cu 
| 3 ++- cpp/src/unary/null_ops.cu | 5 +++-- cpp/src/unary/unary_ops.cuh | 7 ++++--- java/src/main/native/src/map_lookup.cu | 3 +-- 17 files changed, 41 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index fe0d1dcf2a7..563f66ad2c8 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -22,7 +22,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include #include #include diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 8ca70db74a6..e70dbd399c9 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -22,7 +22,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 8a1938423f0..76d2e57597f 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -19,9 +19,10 @@ #include #include +#include + #include #include -#include "rmm/cuda_stream_view.hpp" /** * @file table_device_view.cuh diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 90b76498860..91354250073 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -28,13 +28,14 @@ #include #include +#include + #include #include #include #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace detail { diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index 1df9fc78aa2..ff532059108 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -32,10 +32,11 @@ #include #include +#include + #include #include -#include "rmm/cuda_stream_view.hpp" namespace { template diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 27a85c03898..60e7c496e06 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -23,7 +23,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include #include #include diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index 1ae6119aefd..efc19791c07 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -18,10 +18,11 @@ #include #include +#include + #include #include -#include "rmm/cuda_stream_view.hpp" namespace cudf { namespace { diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 9d6dd55ec03..2b58c1a864a 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -14,6 +14,9 @@ * limitations under the License. 
*/ +#include +#include + #include #include #include @@ -22,9 +25,7 @@ #include #include -#include -#include -#include "rmm/cuda_stream_view.hpp" +#include namespace cudf { namespace detail { diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 6f860dfd60d..2c7542a2f5d 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -35,7 +35,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include #include diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu index c912143f6d7..e1c665cf8dd 100644 --- a/cpp/src/reshape/tile.cu +++ b/cpp/src/reshape/tile.cu @@ -15,18 +15,19 @@ */ #include +#include #include #include #include #include #include -#include +#include #include #include -#include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace { diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index 18d6839e2a2..028796d59cb 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "rmm/cuda_stream_view.hpp" #include "sort_impl.cuh" #include @@ -23,6 +22,8 @@ #include #include +#include + namespace cudf { namespace detail { std::unique_ptr sorted_order(table_view input, diff --git a/cpp/src/stream_compaction/drop_nulls.cu b/cpp/src/stream_compaction/drop_nulls.cu index 71aa8f6c63c..7eb8e1c9644 100644 --- a/cpp/src/stream_compaction/drop_nulls.cu +++ b/cpp/src/stream_compaction/drop_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace { // Returns true if the mask is true for index i in at least keep_threshold diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 2372382d178..f4224f87957 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -26,12 +26,13 @@ #include #include #include "jit/code/code.h" -#include "rmm/cuda_stream_view.hpp" #include #include #include +#include + namespace cudf { namespace transformation { //! Jit functions diff --git a/cpp/src/unary/nan_ops.cu b/cpp/src/unary/nan_ops.cu index 9f8f0e53cb2..1840aebf8f0 100644 --- a/cpp/src/unary/nan_ops.cu +++ b/cpp/src/unary/nan_ops.cu @@ -21,7 +21,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { namespace detail { diff --git a/cpp/src/unary/null_ops.cu b/cpp/src/unary/null_ops.cu index 699439da1c9..6a967b4ecd7 100644 --- a/cpp/src/unary/null_ops.cu +++ b/cpp/src/unary/null_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,8 @@ #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include namespace cudf { std::unique_ptr is_null(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/unary/unary_ops.cuh b/cpp/src/unary/unary_ops.cuh index a74a05437be..ab246bde540 100644 --- a/cpp/src/unary/unary_ops.cuh +++ b/cpp/src/unary/unary_ops.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,13 +17,14 @@ #ifndef UNARY_OPS_H #define UNARY_OPS_H -#include #include #include #include #include #include -#include "rmm/cuda_stream_view.hpp" + +#include +#include namespace cudf { namespace unary { diff --git a/java/src/main/native/src/map_lookup.cu b/java/src/main/native/src/map_lookup.cu index a3e25ce8905..95eea10e8e0 100644 --- a/java/src/main/native/src/map_lookup.cu +++ b/java/src/main/native/src/map_lookup.cu @@ -25,10 +25,9 @@ #include #include #include +#include #include -#include "rmm/cuda_stream_view.hpp" - namespace cudf { namespace { From 81cd6012014cf8fc511888b6706a01e163e791e2 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 19 Nov 2020 07:33:22 +1100 Subject: [PATCH 50/51] Add missing include. --- cpp/src/hash/helper_functions.cuh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/hash/helper_functions.cuh b/cpp/src/hash/helper_functions.cuh index 1b6411de612..57747142f58 100644 --- a/cpp/src/hash/helper_functions.cuh +++ b/cpp/src/hash/helper_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019, NVIDIA CORPORATION. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #ifndef HELPER_FUNCTIONS_CUH #define HELPER_FUNCTIONS_CUH +#include + #include constexpr int64_t DEFAULT_HASH_TABLE_OCCUPANCY = 50; From 9fc08f349a53f28f64e4ee0f316b56f9cf3bb656 Mon Sep 17 00:00:00 2001 From: Mark Harris Date: Thu, 19 Nov 2020 17:42:12 +1100 Subject: [PATCH 51/51] cudaStream_t to cuda_stream_view in math_ops --- cpp/src/unary/math_ops.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index d27dac3542c..f756f68e4a9 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -276,7 +276,7 @@ struct fixed_point_abs { template typename FixedPointFunctor> std::unique_ptr unary_op_with(column_view const& input, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { using Type = device_storage_type_t; @@ -294,7 +294,7 @@ std::unique_ptr unary_op_with(column_view const& input, auto out_view = result->mutable_view(); Type const n = std::pow(10, -input.type().scale()); - thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::transform(rmm::exec_policy(stream)->on(stream.value()), input.begin(), input.end(), out_view.begin(), @@ -554,7 +554,7 @@ struct FixedPointOpDispatcher { std::enable_if_t(), std::unique_ptr> operator()( column_view const& input, cudf::unary_op op, - cudaStream_t stream, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { switch (op) {