From 9c9dd54f5ec6834ed87f26828c5e3878ced39b2d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 13 Mar 2023 16:59:09 -0400 Subject: [PATCH 01/63] Remove default parameters from detail headers in include (#12888) Contributes to #9854. None of these changes should affect users, nor do they impose a particularly onerous burden on libcudf developers (just some extra passing through `mr` or `cudf::get_default_stream()`. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/12888 --- cpp/benchmarks/common/generate_input.cu | 4 +- cpp/include/cudf/detail/copy.hpp | 121 ++++++++---------- cpp/include/cudf/detail/gather.cuh | 15 +-- cpp/include/cudf/detail/gather.hpp | 28 ++-- cpp/include/cudf/detail/hashing.hpp | 38 +++--- cpp/include/cudf/detail/interop.hpp | 8 +- cpp/include/cudf/detail/stream_compaction.hpp | 97 +++++++------- cpp/include/cudf/lists/detail/gather.cuh | 32 ++--- cpp/src/binaryop/compiled/binary_ops.hpp | 80 ++++++------ cpp/src/copying/copy.cu | 6 +- cpp/src/copying/scatter.cu | 9 +- cpp/src/rolling/detail/lead_lag_nested.cuh | 6 +- cpp/src/sort/segmented_sort_impl.cuh | 10 +- cpp/src/stream_compaction/distinct.cu | 10 +- cpp/src/stream_compaction/distinct_count.cu | 3 +- cpp/src/stream_compaction/stable_distinct.cu | 10 +- cpp/src/stream_compaction/unique_count.cu | 3 +- cpp/tests/copying/detail_gather_tests.cu | 19 ++- cpp/tests/copying/gather_str_tests.cpp | 16 ++- java/src/main/native/src/ColumnViewJni.cu | 18 +-- 20 files changed, 270 insertions(+), 263 deletions(-) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index edb19b7b0ca..545028260b8 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -542,7 
+543,8 @@ std::unique_ptr create_random_column(data_profi sample_indices, cudf::out_of_bounds_policy::DONT_CHECK, cudf::detail::negative_index_policy::NOT_ALLOWED, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); return std::move(str_table->release()[0]); } diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index 8c3f315284d..83395f8fa90 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -144,12 +144,11 @@ std::vector split(table_view const& input, * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr shift( - column_view const& input, - size_type offset, - scalar const& fill_value, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr shift(column_view const& input, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Performs segmented shifts for specified values. @@ -184,24 +183,22 @@ std::unique_ptr shift( * * @note If `offset == 0`, a copy of @p segmented_values is returned. 
*/ -std::unique_ptr segmented_shift( - column_view const& segmented_values, - device_span segment_offsets, - size_type offset, - scalar const& fill_value, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr segmented_shift(column_view const& segmented_values, + device_span segment_offsets, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::contiguous_split * * @param stream CUDA stream used for device memory operations and kernel launches. **/ -std::vector contiguous_split( - cudf::table_view const& input, - std::vector const& splits, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::vector contiguous_split(cudf::table_view const& input, + std::vector const& splits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::pack @@ -210,7 +207,7 @@ std::vector contiguous_split( **/ packed_columns pack(cudf::table_view const& input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::allocate_like(column_view const&, size_type, mask_allocation_policy, @@ -218,12 +215,11 @@ packed_columns pack(cudf::table_view const& input, * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr allocate_like( - column_view const& input, - size_type size, - mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr allocate_like(column_view const& input, + size_type size, + mask_allocation_policy mask_alloc, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::copy_if_else( column_view const&, column_view const&, @@ -231,12 +227,11 @@ std::unique_ptr allocate_like( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr copy_if_else( - column_view const& lhs, - column_view const& rhs, - column_view const& boolean_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr copy_if_else(column_view const& lhs, + column_view const& rhs, + column_view const& boolean_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::copy_if_else( scalar const&, column_view const&, @@ -244,12 +239,11 @@ std::unique_ptr copy_if_else( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr copy_if_else( - scalar const& lhs, - column_view const& rhs, - column_view const& boolean_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr copy_if_else(scalar const& lhs, + column_view const& rhs, + column_view const& boolean_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::copy_if_else( column_view const&, scalar const&, @@ -257,12 +251,11 @@ std::unique_ptr copy_if_else( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr copy_if_else( - column_view const& lhs, - scalar const& rhs, - column_view const& boolean_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr copy_if_else(column_view const& lhs, + scalar const& rhs, + column_view const& boolean_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::copy_if_else( scalar const&, scalar const&, @@ -270,36 +263,33 @@ std::unique_ptr copy_if_else( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr copy_if_else( - scalar const& lhs, - scalar const& rhs, - column_view const& boolean_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr copy_if_else(scalar const& lhs, + scalar const& rhs, + column_view const& boolean_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::sample * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr sample( - table_view const& input, - size_type const n, - sample_with_replacement replacement = sample_with_replacement::FALSE, - int64_t const seed = 0, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
sample(table_view const& input, + size_type const n, + sample_with_replacement replacement, + int64_t const seed, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::get_element * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr get_element( - column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr get_element(column_view const& input, + size_type index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::has_nonempty_nulls @@ -320,10 +310,9 @@ bool may_have_nonempty_nulls(column_view const& input, rmm::cuda_stream_view str * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr purge_nonempty_nulls( - column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr purge_nonempty_nulls(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 57d834e6277..ac2865c05c5 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -647,13 +647,12 @@ void gather_bitmask(table_view const& source, * @return cudf::table Result of the gather */ template -std::unique_ptr
gather( - table_view const& source_table, - MapIterator gather_map_begin, - MapIterator gather_map_end, - out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr
gather(table_view const& source_table, + MapIterator gather_map_begin, + MapIterator gather_map_end, + out_of_bounds_policy bounds_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::vector> destination_columns; diff --git a/cpp/include/cudf/detail/gather.hpp b/cpp/include/cudf/detail/gather.hpp index 9d61a8de184..034eb6c1282 100644 --- a/cpp/include/cudf/detail/gather.hpp +++ b/cpp/include/cudf/detail/gather.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,13 +61,12 @@ enum class negative_index_policy : bool { ALLOWED, NOT_ALLOWED }; * @param[in] mr Device memory resource used to allocate the returned table's device memory * @return Result of the gather */ -std::unique_ptr
gather( - table_view const& source_table, - column_view const& gather_map, - out_of_bounds_policy bounds_policy, - negative_index_policy neg_indices, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
gather(table_view const& source_table, + column_view const& gather_map, + out_of_bounds_policy bounds_policy, + negative_index_policy neg_indices, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::detail::gather(table_view const&,column_view const&,table_view @@ -76,13 +75,12 @@ std::unique_ptr
gather( * * @throws cudf::logic_error if `gather_map` span size is larger than max of `size_type`. */ -std::unique_ptr
gather( - table_view const& source_table, - device_span const gather_map, - out_of_bounds_policy bounds_policy, - negative_index_policy neg_indices, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
gather(table_view const& source_table, + device_span const gather_map, + out_of_bounds_policy bounds_policy, + negative_index_policy neg_indices, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index b7469d80a8d..771b3e150ec 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,29 +31,25 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr hash( - table_view const& input, - hash_id hash_function = hash_id::HASH_MURMUR3, - uint32_t seed = cudf::DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr hash(table_view const& input, + hash_id hash_function, + uint32_t seed, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); -std::unique_ptr murmur_hash3_32( - table_view const& input, - uint32_t seed = cudf::DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr murmur_hash3_32(table_view const& input, + uint32_t seed, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); -std::unique_ptr spark_murmur_hash3_32( - table_view const& input, - uint32_t seed = cudf::DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr spark_murmur_hash3_32(table_view const& input, + 
uint32_t seed, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); -std::unique_ptr md5_hash( - table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr md5_hash(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /* Copyright 2005-2014 Daniel James. * diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 25ce5b09eb8..452144da167 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -111,9 +111,9 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); * @param stream CUDA stream used for device memory operations and kernel launches. */ std::shared_ptr to_arrow(table_view input, - std::vector const& metadata = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); + std::vector const& metadata, + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr); /** * @copydoc cudf::arrow_to_cudf diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index e725718ed22..e0fc7b71cd9 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,12 +32,11 @@ namespace detail { * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr
drop_nulls( - table_view const& input, - std::vector const& keys, - cudf::size_type keep_threshold, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
drop_nulls(table_view const& input, + std::vector const& keys, + cudf::size_type keep_threshold, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::drop_nans(table_view const&, std::vector const&, @@ -45,50 +44,46 @@ std::unique_ptr
drop_nulls( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
drop_nans( - table_view const& input, - std::vector const& keys, - cudf::size_type keep_threshold, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
drop_nans(table_view const& input, + std::vector const& keys, + cudf::size_type keep_threshold, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::apply_boolean_mask * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
apply_boolean_mask( - table_view const& input, - column_view const& boolean_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
apply_boolean_mask(table_view const& input, + column_view const& boolean_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::unique * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
unique( - table_view const& input, - std::vector const& keys, - duplicate_keep_option keep, - null_equality nulls_equal = null_equality::EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
unique(table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::distinct * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
distinct( - table_view const& input, - std::vector const& keys, - duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
distinct(table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Create a new table without duplicate rows. @@ -110,14 +105,13 @@ std::unique_ptr
distinct( * @param mr Device memory resource used to allocate the returned table * @return A table containing the resulting distinct rows */ -std::unique_ptr
stable_distinct( - table_view const& input, - std::vector const& keys, - duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
stable_distinct(table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Create a column of indices of all distinct rows in the input table. @@ -133,13 +127,12 @@ std::unique_ptr
stable_distinct( * @param mr Device memory resource used to allocate the returned vector * @return A device_uvector containing the result indices */ -rmm::device_uvector get_distinct_indices( - table_view const& input, - duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +rmm::device_uvector get_distinct_indices(table_view const& input, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::unique_count(column_view const&, null_policy, nan_policy) @@ -157,8 +150,8 @@ cudf::size_type unique_count(column_view const& input, * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ cudf::size_type unique_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream()); + null_equality nulls_equal, + rmm::cuda_stream_view stream); /** * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) @@ -176,8 +169,8 @@ cudf::size_type distinct_count(column_view const& input, * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ cudf::size_type distinct_count(table_view const& input, - null_equality nulls_equal = null_equality::EQUAL, - rmm::cuda_stream_view stream = cudf::get_default_stream()); + null_equality nulls_equal, + rmm::cuda_stream_view stream); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 48c0ed8f6e9..83710a49f6a 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -285,11 +285,10 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, * * @returns column with elements gathered based on `gather_data` */ -std::unique_ptr gather_list_nested( - lists_column_view const& list, - gather_data& gd, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr gather_list_nested(lists_column_view const& list, + gather_data& gd, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Gather a leaf column from a hierarchy of list columns. 
@@ -303,11 +302,10 @@ std::unique_ptr gather_list_nested( * * @returns column with elements gathered based on `gather_data` */ -std::unique_ptr gather_list_leaf( - column_view const& column, - gather_data const& gd, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr gather_list_leaf(column_view const& column, + gather_data const& gd, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::lists::segmented_gather(lists_column_view const& source_column, @@ -317,13 +315,11 @@ std::unique_ptr gather_list_leaf( * * @param stream CUDA stream on which to execute kernels */ -std::unique_ptr segmented_gather( - lists_column_view const& source_column, - lists_column_view const& gather_map_list, - out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, - // Move before bounds_policy? - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr segmented_gather(lists_column_view const& source_column, + lists_column_view const& gather_map_list, + out_of_bounds_policy bounds_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace lists diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp index c51993409ef..47fd50c5d97 100644 --- a/cpp/src/binaryop/compiled/binary_ops.hpp +++ b/cpp/src/binaryop/compiled/binary_ops.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,29 +32,26 @@ class mutable_column_device_view; namespace binops { namespace compiled { -std::unique_ptr string_null_min_max( - scalar const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr string_null_min_max(scalar const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); -std::unique_ptr string_null_min_max( - column_view const& lhs, - scalar const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr string_null_min_max(column_view const& lhs, + scalar const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); -std::unique_ptr string_null_min_max( - column_view const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr string_null_min_max(column_view const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Performs a binary operation between a string scalar and a string @@ -75,13 +72,12 @@ std::unique_ptr string_null_min_max( * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr Output column */ -std::unique_ptr binary_operation( - scalar const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr binary_operation(scalar const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Performs a binary operation between a string column and a string @@ -102,13 +98,12 @@ std::unique_ptr binary_operation( * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr Output column */ -std::unique_ptr binary_operation( - column_view const& lhs, - scalar const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr binary_operation(column_view const& lhs, + scalar const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Performs a binary operation between two string columns. 
@@ -128,13 +123,12 @@ std::unique_ptr binary_operation( * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr Output column */ -std::unique_ptr binary_operation( - column_view const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr binary_operation(column_view const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); void binary_operation(mutable_column_view& out, scalar const& lhs, diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index 0978cf441d8..9ec00612f2f 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -174,7 +175,8 @@ std::unique_ptr scatter_gather_based_if_else(cudf::column_view const& lh gather_map, out_of_bounds_policy::DONT_CHECK, negative_index_policy::NOT_ALLOWED, - stream); + stream, + rmm::mr::get_current_device_resource()); auto result = cudf::detail::scatter( table_view{std::vector{scatter_src_lhs->get_column(0).view()}}, diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index dd4912a216e..316f39b616c 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -253,7 +253,8 @@ struct column_scalar_scatterer_impl { auto scatter_functor = column_scalar_scatterer{}; auto fields_iter_begin = make_counting_transform_iterator(0, [&](auto const& i) { - auto row_slr = get_element(typed_s->view().column(i), 0, stream); + auto row_slr = + get_element(typed_s->view().column(i), 0, stream, rmm::mr::get_current_device_resource()); return type_dispatcher(row_slr->type(), scatter_functor, *row_slr, @@ -392,8 +393,8 @@ std::unique_ptr boolean_mask_scatter(column_view const& input, 0); // The scatter map is actually a table with only one column, which is scatter map. - auto scatter_map = - detail::apply_boolean_mask(table_view{{indices->view()}}, boolean_mask, stream); + auto scatter_map = detail::apply_boolean_mask( + table_view{{indices->view()}}, boolean_mask, stream, rmm::mr::get_current_device_resource()); auto output_table = detail::scatter( table_view{{input}}, scatter_map->get_column(0).view(), table_view{{target}}, stream, mr); diff --git a/cpp/src/rolling/detail/lead_lag_nested.cuh b/cpp/src/rolling/detail/lead_lag_nested.cuh index 859ed7e5d53..d2fe9fabd1b 100644 --- a/cpp/src/rolling/detail/lead_lag_nested.cuh +++ b/cpp/src/rolling/detail/lead_lag_nested.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -191,7 +192,8 @@ std::unique_ptr compute_lead_lag_for_nested(aggregation::Kind op, scatter_map, out_of_bounds_policy::DONT_CHECK, cudf::detail::negative_index_policy::NOT_ALLOWED, - stream); + stream, + rmm::mr::get_current_device_resource()); // Scatter defaults into locations where LEAD/LAG computed nulls. 
auto scattered_results = cudf::detail::scatter( diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh index a32382b840f..b7347974173 100644 --- a/cpp/src/sort/segmented_sort_impl.cuh +++ b/cpp/src/sort/segmented_sort_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include #include @@ -72,8 +73,11 @@ struct column_fast_sort_fn { { // CUB's segmented sort functions cannot accept iterators. // We create a temporary column here for it to use. - auto temp_col = - cudf::detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream); + auto temp_col = cudf::detail::allocate_like(input, + input.size(), + mask_allocation_policy::NEVER, + stream, + rmm::mr::get_current_device_resource()); mutable_column_view output_view = temp_col->mutable_view(); // DeviceSegmentedSort is faster than DeviceSegmentedRadixSort at this time diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index e15d54b4251..083b1b2eb46 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -25,6 +25,8 @@ #include #include +#include + #include #include #include @@ -145,8 +147,12 @@ std::unique_ptr
distinct(table_view const& input, return empty_like(input); } - auto const gather_map = - get_distinct_indices(input.select(keys), keep, nulls_equal, nans_equal, stream); + auto const gather_map = get_distinct_indices(input.select(keys), + keep, + nulls_equal, + nans_equal, + stream, + rmm::mr::get_current_device_resource()); return detail::gather(input, gather_map, out_of_bounds_policy::DONT_CHECK, diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 760fcf4bb6b..0dae26c18a9 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -209,6 +210,6 @@ cudf::size_type distinct_count(column_view const& input, cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal) { CUDF_FUNC_RANGE(); - return detail::distinct_count(input, nulls_equal); + return detail::distinct_count(input, nulls_equal, cudf::get_default_stream()); } } // namespace cudf diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu index dc80a454777..d45897930b0 100644 --- a/cpp/src/stream_compaction/stable_distinct.cu +++ b/cpp/src/stream_compaction/stable_distinct.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,8 +38,12 @@ std::unique_ptr
stable_distinct(table_view const& input, return empty_like(input); } - auto const distinct_indices = - get_distinct_indices(input.select(keys), keep, nulls_equal, nans_equal, stream); + auto const distinct_indices = get_distinct_indices(input.select(keys), + keep, + nulls_equal, + nans_equal, + stream, + rmm::mr::get_current_device_resource()); // Markers to denote which rows to be copied to the output. auto const output_markers = [&] { diff --git a/cpp/src/stream_compaction/unique_count.cu b/cpp/src/stream_compaction/unique_count.cu index c7c10438d7a..4c1cf2b2bc3 100644 --- a/cpp/src/stream_compaction/unique_count.cu +++ b/cpp/src/stream_compaction/unique_count.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -144,7 +145,7 @@ cudf::size_type unique_count(column_view const& input, cudf::size_type unique_count(table_view const& input, null_equality nulls_equal) { CUDF_FUNC_RANGE(); - return detail::unique_count(input, nulls_equal); + return detail::unique_count(input, nulls_equal, cudf::get_default_stream()); } } // namespace cudf diff --git a/cpp/tests/copying/detail_gather_tests.cu b/cpp/tests/copying/detail_gather_tests.cu index bf2937ae8ab..aae511413ef 100644 --- a/cpp/tests/copying/detail_gather_tests.cu +++ b/cpp/tests/copying/detail_gather_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -58,7 +58,12 @@ TYPED_TEST(GatherTest, GatherDetailDeviceVectorTest) // test with device vector iterators { std::unique_ptr result = - cudf::detail::gather(source_table, gather_map.begin(), gather_map.end()); + cudf::detail::gather(source_table, + gather_map.begin(), + gather_map.end(), + cudf::out_of_bounds_policy::DONT_CHECK, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); for (auto i = 0; i < source_table.num_columns(); ++i) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(source_table.column(i), result->view().column(i)); @@ -70,7 +75,12 @@ TYPED_TEST(GatherTest, GatherDetailDeviceVectorTest) // test with raw pointers { std::unique_ptr result = - cudf::detail::gather(source_table, gather_map.data(), gather_map.data() + gather_map.size()); + cudf::detail::gather(source_table, + gather_map.begin(), + gather_map.data() + gather_map.size(), + cudf::out_of_bounds_policy::DONT_CHECK, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); for (auto i = 0; i < source_table.num_columns(); ++i) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(source_table.column(i), result->view().column(i)); @@ -97,7 +107,8 @@ TYPED_TEST(GatherTest, GatherDetailInvalidIndexTest) gather_map, cudf::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto expect_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2) ? 0 : i; }); diff --git a/cpp/tests/copying/gather_str_tests.cpp b/cpp/tests/copying/gather_str_tests.cpp index 3db2ce399cc..7810566fbf1 100644 --- a/cpp/tests/copying/gather_str_tests.cpp +++ b/cpp/tests/copying/gather_str_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,6 +25,8 @@ #include #include +#include + class GatherTestStr : public cudf::test::BaseFixture { }; @@ -87,7 +89,8 @@ TEST_F(GatherTestStr, Gather) gather_map, cudf::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); std::vector h_expected; std::vector expected_validity; @@ -118,7 +121,8 @@ TEST_F(GatherTestStr, GatherDontCheckOutOfBounds) gather_map, cudf::out_of_bounds_policy::DONT_CHECK, cudf::detail::negative_index_policy::NOT_ALLOWED, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); std::vector h_expected; for (auto itr = h_map.begin(); itr != h_map.end(); ++itr) { @@ -137,7 +141,8 @@ TEST_F(GatherTestStr, GatherEmptyMapStringsColumn) gather_map, cudf::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); cudf::test::expect_column_empty(results->get_column(0).view()); } @@ -151,6 +156,7 @@ TEST_F(GatherTestStr, GatherZeroSizeStringsColumn) gather_map, cudf::out_of_bounds_policy::NULLIFY, cudf::detail::negative_index_policy::NOT_ALLOWED, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, results->get_column(0).view()); } diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu index 7e0b0f9330d..8a2c0b2b411 100644 --- a/java/src/main/native/src/ColumnViewJni.cu +++ b/java/src/main/native/src/ColumnViewJni.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -187,13 +188,14 @@ std::unique_ptr lists_distinct_by_key(cudf::lists_column_view cons // Use `cudf::duplicate_keep_option::KEEP_LAST` so this will produce the desired behavior when // being called in `create_map` in spark-rapids. // Other options comparing nulls and NaNs are set as all-equal. - auto out_columns = cudf::detail::stable_distinct( - table_view{{column_view{cudf::device_span{labels}}, - child.child(0), child.child(1)}}, // input table - std::vector{0, 1}, // key columns - cudf::duplicate_keep_option::KEEP_LAST, cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL, stream) - ->release(); + auto out_columns = + cudf::detail::stable_distinct( + table_view{{column_view{cudf::device_span{labels}}, child.child(0), + child.child(1)}}, // input table + std::vector{0, 1}, // key columns + cudf::duplicate_keep_option::KEEP_LAST, cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, stream, rmm::mr::get_current_device_resource()) + ->release(); auto const out_labels = out_columns.front()->view(); // Assemble a structs column of . From 3584739a301fa8ab98caa4b7a887aab26712f9d4 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 13 Mar 2023 22:58:20 +0100 Subject: [PATCH 02/63] Setting a threshold for KvikIO IO (#12841) For small reads and writes the overhead of using cuFile and/or KvikIO becomes significant. This PR introduces the threshold already used by the `GDS` to the `KVIKIO` backend as well. Closes #12780 ### Future work Let's optimize KvikIO for small reads and writes so we don't need this threshold. Tracking here: https://github.com/rapidsai/kvikio/issues/178 # Authors: - Mads R. B. 
Kristensen (https://github.com/madsbk) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/12841 --- cpp/src/io/utilities/data_sink.cpp | 8 ++++-- cpp/src/io/utilities/datasource.cpp | 6 ++-- cpp/src/io/utilities/file_io_utilities.hpp | 32 ++-------------------- 3 files changed, 12 insertions(+), 34 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index cba45f693f9..40b70986eca 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -63,8 +63,8 @@ class file_sink : public data_sink { [[nodiscard]] bool is_device_write_preferred(size_t size) const override { - return !_kvikio_file.closed() || - (_cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size)); + if (size < _gds_write_preferred_threshold) { return false; } + return supports_device_write(); } std::future device_write_async(void const* gpu_data, @@ -96,6 +96,8 @@ class file_sink : public data_sink { size_t _bytes_written = 0; std::unique_ptr _cufile_out; kvikio::FileHandle _kvikio_file; + // The write size above which GDS is faster then d2h-copy + posix-write + static constexpr size_t _gds_write_preferred_threshold = 128 << 10; // 128KB }; /** diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 71d64900398..e2cea7a56ff 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -56,8 +56,8 @@ class file_source : public datasource { [[nodiscard]] bool is_device_read_preferred(size_t size) const override { - return !_kvikio_file.closed() || - (_cufile_in != nullptr && 
_cufile_in->is_cufile_io_preferred(size)); + if (size < _gds_read_preferred_threshold) { return false; } + return supports_device_read(); } std::future device_read_async(size_t offset, @@ -98,6 +98,8 @@ class file_source : public datasource { private: std::unique_ptr _cufile_in; kvikio::FileHandle _kvikio_file; + // The read size above which GDS is faster then posix-read + h2d-copy + static constexpr size_t _gds_read_preferred_threshold = 128 << 10; // 128KB }; /** diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 38674892966..b55dd3b1583 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,36 +49,10 @@ class file_wrapper { [[nodiscard]] auto desc() const { return fd; } }; -/** - * @brief Base class for cuFile input/output. - * - * Contains the common API for cuFile input and output classes. - */ -class cufile_io_base { - public: - /** - * @brief Returns an estimate of whether the cuFile operation is the optimal option. - * - * @param size Read/write operation size, in bytes. - * @return Whether a cuFile operation with the given size is expected to be faster than a host - * read + H2D copy - */ - static bool is_cufile_io_preferred(size_t size) { return size > op_size_threshold; } - - protected: - /** - * @brief The read/write size above which cuFile is faster then host read + copy - * - * This may not be the optimal threshold for all systems. Derived `is_cufile_io_preferred` - * implementations can use a different logic. - */ - static constexpr size_t op_size_threshold = 128 << 10; -}; - /** * @brief Interface class for cufile input. 
*/ -class cufile_input : public cufile_io_base { +class cufile_input { public: /** * @brief Asynchronously reads into existing device memory. @@ -101,7 +75,7 @@ class cufile_input : public cufile_io_base { /** * @brief Interface class for cufile output. */ -class cufile_output : public cufile_io_base { +class cufile_output { public: /** * @brief Asynchronously writes the data from a device buffer into a file. From 55ed34778694974f19af71f9be82721b8b8af183 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 14 Mar 2023 11:20:51 -0400 Subject: [PATCH 03/63] Enable detection of undesired stream usage (#12089) This PR builds on #11875 and partially addresses #11943. This PR allows us to run all tests on a precise stream (the newly introduced `cudf::test::get_default_stream()`) and then verify that all CUDA APIs end up invoked on that stream. This implements the feature required in #11943, but to apply it universally across libcudf will require the API changes that will expose streams so I plan to make those changes incrementally after this PR is merged. The preload library is now compiled twice, once to overload `cudf::get_default_stream` and once to overload `cudf::test::get_default_stream`. For now there is still some manual coordination associated with determining which one should be used with a given test, but once #12451 is merged and we start running all tests via ctest instead of direct invocation of the test executables we can start encoding this information in the CMake configuration of the tests by associating the require environment variables directly with the test executable using `set_tests_properties`. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Ray Douglass (https://github.com/raydouglass) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/12089 --- ci/test_cpp.sh | 9 +- conda/recipes/libcudf/meta.yaml | 7 +- cpp/CMakeLists.txt | 41 ++++++--- cpp/include/cudf_test/base_fixture.hpp | 63 +++++++++----- cpp/include/cudf_test/default_stream.hpp | 41 +++++++++ ...p => stream_checking_resource_adaptor.hpp} | 56 +++++++++--- cpp/tests/CMakeLists.txt | 4 +- cpp/tests/error/error_handling_test.cu | 16 ++-- cpp/tests/utilities/default_stream.cpp | 27 ++++++ cpp/tests/utilities/identify_stream_usage.cpp | 85 +++++++++++++------ 10 files changed, 264 insertions(+), 85 deletions(-) create mode 100644 cpp/include/cudf_test/default_stream.hpp rename cpp/include/cudf_test/{stream_checking_resource_adapter.hpp => stream_checking_resource_adaptor.hpp} (69%) create mode 100644 cpp/tests/utilities/default_stream.cpp diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index bd7a82afbea..3f65399d3af 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -8,9 +8,10 @@ trap "EXITCODE=1" ERR set +e # Get library for finding incorrect default stream usage. 
-STREAM_IDENTIFY_LIB="${CONDA_PREFIX}/lib/libcudf_identify_stream_usage.so" +STREAM_IDENTIFY_LIB_MODE_CUDF="${CONDA_PREFIX}/lib/libcudf_identify_stream_usage_mode_cudf.so" +STREAM_IDENTIFY_LIB_MODE_TESTING="${CONDA_PREFIX}/lib/libcudf_identify_stream_usage_mode_testing.so" -echo "STREAM_IDENTIFY_LIB=${STREAM_IDENTIFY_LIB}" +echo "STREAM_IDENTIFY_LIB=${STREAM_IDENTIFY_LIB_MODE_CUDF}" # Run libcudf and libcudf_kafka gtests from libcudf-tests package rapids-logger "Run gtests" @@ -31,10 +32,10 @@ for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do # This one test is specifically designed to test using a thrust device # vector, so we expect and allow it to include default stream usage. gtest_filter="SpanTest.CanConstructFromDeviceContainers" - GTEST_CUDF_STREAM_MODE="custom" LD_PRELOAD=${STREAM_IDENTIFY_LIB} ${gt} --gtest_output=xml:${RAPIDS_TESTS_DIR} --gtest_filter="-${gtest_filter}" && \ + GTEST_CUDF_STREAM_MODE="new_cudf_default" LD_PRELOAD=${STREAM_IDENTIFY_LIB_MODE_CUDF} ${gt} --gtest_output=xml:${RAPIDS_TESTS_DIR} --gtest_filter="-${gtest_filter}" && \ ${gt} --gtest_output=xml:${RAPIDS_TESTS_DIR} --gtest_filter="${gtest_filter}" else - GTEST_CUDF_STREAM_MODE="custom" LD_PRELOAD=${STREAM_IDENTIFY_LIB} ${gt} --gtest_output=xml:${RAPIDS_TESTS_DIR} + GTEST_CUDF_STREAM_MODE="new_cudf_default" LD_PRELOAD=${STREAM_IDENTIFY_LIB_MODE_CUDF} ${gt} --gtest_output=xml:${RAPIDS_TESTS_DIR} fi done diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 770a234b56e..b31ff37d23b 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -74,7 +74,9 @@ outputs: test: commands: - test -f $PREFIX/lib/libcudf.so - - test -f $PREFIX/lib/libcudftestutil.a + - test -f $PREFIX/lib/libcudftestutil.so + - test -f $PREFIX/lib/libcudf_identify_stream_usage_mode_cudf.so + - test -f $PREFIX/lib/libcudf_identify_stream_usage_mode_testing.so - test -f $PREFIX/include/cudf/aggregation.hpp - test -f 
$PREFIX/include/cudf/ast/detail/expression_parser.hpp - test -f $PREFIX/include/cudf/ast/detail/operators.hpp @@ -294,11 +296,12 @@ outputs: - test -f $PREFIX/include/cudf_test/column_wrapper.hpp - test -f $PREFIX/include/cudf_test/cudf_gtest.hpp - test -f $PREFIX/include/cudf_test/cxxopts.hpp + - test -f $PREFIX/include/cudf_test/default_stream.hpp - test -f $PREFIX/include/cudf_test/detail/column_utilities.hpp - test -f $PREFIX/include/cudf_test/file_utilities.hpp - test -f $PREFIX/include/cudf_test/io_metadata_utilities.hpp - test -f $PREFIX/include/cudf_test/iterator_utilities.hpp - - test -f $PREFIX/include/cudf_test/stream_checking_resource_adapter.hpp + - test -f $PREFIX/include/cudf_test/stream_checking_resource_adaptor.hpp - test -f $PREFIX/include/cudf_test/table_utilities.hpp - test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh - test -f $PREFIX/include/cudf_test/type_list_utilities.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a261049d3f0..12b812d0bbe 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -740,10 +740,15 @@ add_library(cudf::cudf ALIAS cudf) if(CUDF_BUILD_TESTUTIL) add_library( - cudftestutil STATIC + # This library must also be compiled as a dynamic library to support + # LD_PRELOAD injection of symbols. We currently leverage this for + # stream-related library validation and may make use of it for other + # similar features in the future. + cudftestutil SHARED tests/io/metadata_utilities.cpp tests/utilities/base_fixture.cpp tests/utilities/column_utilities.cu + tests/utilities/default_stream.cpp tests/utilities/table_utilities.cu tests/utilities/tdigest_utilities.cu ) @@ -790,18 +795,27 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) ) endif() - # Libraries for stream-related testing. - add_library(cudf_identify_stream_usage SHARED tests/utilities/identify_stream_usage.cpp) + # Libraries for stream-related testing. We build the library twice, one with STREAM_MODE_TESTING + # on and one with it set to off. 
Each test will then be configured to use the appropriate library + # depending via ctest and whether it has been updated to expose public stream APIs. + foreach(_mode cudf testing) + set(_tgt "cudf_identify_stream_usage_mode_${_mode}") + add_library(${_tgt} SHARED tests/utilities/identify_stream_usage.cpp) + + set_target_properties( + ${_tgt} + PROPERTIES # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + ) + target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm) + add_library(cudf::${_tgt} ALIAS ${_tgt}) - set_target_properties( - cudf_identify_stream_usage - PROPERTIES # set target compile options - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - ) - target_link_libraries(cudf_identify_stream_usage PUBLIC CUDA::cudart rmm::rmm) - add_library(cudf::cudf_identify_stream_usage ALIAS cudf_identify_stream_usage) + if("${_mode}" STREQUAL "testing") + target_compile_definitions(${_tgt} PUBLIC STREAM_MODE_TESTING) + endif() + endforeach() endif() # ################################################################################################## @@ -877,7 +891,8 @@ if(CUDF_BUILD_TESTUTIL) endif() if(CUDF_BUILD_STREAMS_TEST_UTIL) - install(TARGETS cudf_identify_stream_usage DESTINATION ${lib_dir}) + install(TARGETS cudf_identify_stream_usage_mode_cudf DESTINATION ${lib_dir}) + install(TARGETS cudf_identify_stream_usage_mode_testing DESTINATION ${lib_dir}) endif() set(doc_string diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp index be4d5bccd7b..6bdfc7bfe98 100644 --- a/cpp/include/cudf_test/base_fixture.hpp +++ b/cpp/include/cudf_test/base_fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,8 +23,9 @@ #include #include #include +#include #include -#include +#include #include #include @@ -308,16 +309,33 @@ inline auto parse_cudf_test_opts(int argc, char** argv) const char* env_rmm_mode = std::getenv("GTEST_CUDF_RMM_MODE"); // Overridden by CLI options const char* env_stream_mode = std::getenv("GTEST_CUDF_STREAM_MODE"); // Overridden by CLI options - auto default_rmm_mode = env_rmm_mode ? env_rmm_mode : "pool"; - auto default_stream_mode = env_stream_mode ? env_stream_mode : "default"; + const char* env_stream_error_mode = + std::getenv("GTEST_CUDF_STREAM_ERROR_MODE"); // Overridden by CLI options + auto default_rmm_mode = env_rmm_mode ? env_rmm_mode : "pool"; + auto default_stream_mode = env_stream_mode ? env_stream_mode : "default"; + auto default_stream_error_mode = env_stream_error_mode ? env_stream_error_mode : "error"; options.allow_unrecognised_options().add_options()( "rmm_mode", "RMM allocation mode", cxxopts::value()->default_value(default_rmm_mode)); + // `new_cudf_default` means that cudf::get_default_stream has been patched, + // so we raise errors anywhere that a CUDA default stream is observed + // instead of cudf::get_default_stream(). This corresponds to compiling + // identify_stream_usage with STREAM_MODE_TESTING=OFF (must do both at the + // same time). + // `new_testing_default` means that cudf::test::get_default_stream has been + // patched, so we raise errors anywhere that _any_ other stream is + // observed. This corresponds to compiling identify_stream_usage with + // STREAM_MODE_TESTING=ON (must do both at the same time). 
options.allow_unrecognised_options().add_options()( "stream_mode", "Whether to use a non-default stream", cxxopts::value()->default_value(default_stream_mode)); + options.allow_unrecognised_options().add_options()( + "stream_error_mode", + "Whether to error or print to stdout when a non-default stream is observed and stream_mode " + "is not \"default\"", + cxxopts::value()->default_value(default_stream_error_mode)); return options.parse(argc, argv); } catch (const cxxopts::OptionException& e) { CUDF_FAIL("Error parsing command line options"); @@ -334,21 +352,24 @@ inline auto parse_cudf_test_opts(int argc, char** argv) * function parses the command line to customize test behavior, like the * allocation mode used for creating the default memory resource. */ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cudf::test::create_memory_resource(rmm_mode); \ - rmm::mr::set_current_device_resource(resource.get()); \ - \ - auto const stream_mode = cmd_opts["stream_mode"].as(); \ - rmm::cuda_stream const new_default_stream{}; \ - if (stream_mode == "custom") { \ - auto adapter = make_stream_checking_resource_adaptor(resource.get()); \ - rmm::mr::set_current_device_resource(&adapter); \ - } \ - \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ + auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ + auto resource = cudf::test::create_memory_resource(rmm_mode); \ + rmm::mr::set_current_device_resource(resource.get()); \ + \ + auto const stream_mode = cmd_opts["stream_mode"].as(); \ + if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { \ + auto const stream_error_mode = 
cmd_opts["stream_error_mode"].as(); \ + auto const error_on_invalid_stream = (stream_error_mode == "error"); \ + auto const check_default_stream = (stream_mode == "new_cudf_default"); \ + auto adaptor = make_stream_checking_resource_adaptor( \ + resource.get(), error_on_invalid_stream, check_default_stream); \ + rmm::mr::set_current_device_resource(&adaptor); \ + } \ + \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/include/cudf_test/default_stream.hpp b/cpp/include/cudf_test/default_stream.hpp new file mode 100644 index 00000000000..1da97d71f44 --- /dev/null +++ b/cpp/include/cudf_test/default_stream.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf { +namespace test { + +/** + * @brief Get the default stream to use for tests. + * + * The standard behavior of this function is to return cudf's default stream + * (cudf::get_default_stream). This function is primarily provided as an + * overload target for preload libraries (via LD_PRELOAD) so that the default + * stream used for tests may be modified for tracking purposes. All tests of + * public APIs that accept streams should pass `cudf::test::get_default_stream` + * as the stream argument so that a preload library changing the behavior of + * this function will trigger those tests to run on a different stream than + * `cudf::get_default_stream`. 
+ * + * @return The default stream to use for tests. + */ +rmm::cuda_stream_view const get_default_stream(); + +} // namespace test +} // namespace cudf diff --git a/cpp/include/cudf_test/stream_checking_resource_adapter.hpp b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp similarity index 69% rename from cpp/include/cudf_test/stream_checking_resource_adapter.hpp rename to cpp/include/cudf_test/stream_checking_resource_adaptor.hpp index 4a22ff148ae..e6108309ae2 100644 --- a/cpp/include/cudf_test/stream_checking_resource_adapter.hpp +++ b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,8 @@ */ #pragma once +#include + #include /** @@ -33,7 +35,12 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res * * @param upstream The resource used for allocating/deallocating device memory */ - stream_checking_resource_adaptor(Upstream* upstream) : upstream_{upstream} + stream_checking_resource_adaptor(Upstream* upstream, + bool error_on_invalid_stream, + bool check_default_stream) + : upstream_{upstream}, + error_on_invalid_stream_{error_on_invalid_stream}, + check_default_stream_{check_default_stream} { CUDF_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } @@ -87,7 +94,7 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res */ void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { - verify_non_default_stream(stream); + verify_stream(stream); return upstream_->allocate(bytes, stream); } @@ -102,7 +109,7 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res */ void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override { - 
verify_non_default_stream(stream); + verify_stream(stream); upstream_->deallocate(ptr, bytes, stream); } @@ -131,25 +138,44 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res */ std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override { - verify_non_default_stream(stream); + verify_stream(stream); return upstream_->get_mem_info(stream); } /** - * @brief Throw an error if given one of CUDA's default stream specifiers. + * @brief Throw an error if the provided stream is invalid. + * + * A stream is invalid if: + * - check_default_stream_ is true and this function is passed one of CUDA's + * default stream specifiers, or + * - check_default_stream_ is false and this function is passed any stream + * other than the result of cudf::test::get_default_stream(). * - * @throws `std::runtime_error` if provided a default stream + * @throws `std::runtime_error` if provided an invalid stream */ - void verify_non_default_stream(rmm::cuda_stream_view const stream) const + void verify_stream(rmm::cuda_stream_view const stream) const { auto cstream{stream.value()}; - if (cstream == cudaStreamDefault || (cstream == cudaStreamLegacy) || - (cstream == cudaStreamPerThread)) { - throw std::runtime_error("Attempted to perform an operation on a default stream!"); + auto const invalid_stream = + check_default_stream_ ? ((cstream == cudaStreamDefault) || (cstream == cudaStreamLegacy) || + (cstream == cudaStreamPerThread)) + : (cstream != cudf::test::get_default_stream().value()); + + if (invalid_stream) { + if (error_on_invalid_stream_) { + throw std::runtime_error("Attempted to perform an operation on an unexpected stream!"); + } else { + std::cout << "Attempted to perform an operation on an unexpected stream!" 
<< std::endl; + } } } - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + Upstream* upstream_; // the upstream resource used for satisfying allocation requests + bool error_on_invalid_stream_; // If true, throw an exception when the wrong stream is detected. + // If false, simply print to stdout. + bool check_default_stream_; // If true, throw an exception when the default stream is observed. + // If false, throw an exception when anything other than + // cudf::test::get_default_stream() is observed. }; /** @@ -160,7 +186,9 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res * @param upstream Pointer to the upstream resource */ template -stream_checking_resource_adaptor make_stream_checking_resource_adaptor(Upstream* upstream) +stream_checking_resource_adaptor make_stream_checking_resource_adaptor( + Upstream* upstream, bool error_on_invalid_stream, bool check_default_stream) { - return stream_checking_resource_adaptor{upstream}; + return stream_checking_resource_adaptor{ + upstream, error_on_invalid_stream, check_default_stream}; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 7c021a73eb5..0d58b19de6a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -519,8 +519,8 @@ ConfigureTest( # tests by manually invoking the executable, so we'll have to manually pass this environment # variable in that setup. set_tests_properties( - STREAM_IDENTIFICATION_TEST PROPERTIES ENVIRONMENT - LD_PRELOAD=$ + STREAM_IDENTIFICATION_TEST + PROPERTIES ENVIRONMENT LD_PRELOAD=$ ) # ################################################################################################## diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index eb4a3e895f9..5b842322681 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,8 @@ */ #include +#include +#include #include #include @@ -125,10 +127,14 @@ int main(int argc, char** argv) ::testing::InitGoogleTest(&argc, argv); auto const cmd_opts = parse_cudf_test_opts(argc, argv); auto const stream_mode = cmd_opts["stream_mode"].as(); - if (stream_mode == "custom") { - auto resource = rmm::mr::get_current_device_resource(); - auto adapter = make_stream_checking_resource_adaptor(resource); - rmm::mr::set_current_device_resource(&adapter); + if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { + auto resource = rmm::mr::get_current_device_resource(); + auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); + auto const error_on_invalid_stream = (stream_error_mode == "error"); + auto const check_default_stream = (stream_mode == "new_cudf_default"); + auto adaptor = make_stream_checking_resource_adaptor( + resource, error_on_invalid_stream, check_default_stream); + rmm::mr::set_current_device_resource(&adaptor); } return RUN_ALL_TESTS(); } diff --git a/cpp/tests/utilities/default_stream.cpp b/cpp/tests/utilities/default_stream.cpp new file mode 100644 index 00000000000..52752f78bb9 --- /dev/null +++ b/cpp/tests/utilities/default_stream.cpp @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace cudf { +namespace test { + +rmm::cuda_stream_view const get_default_stream() { return cudf::get_default_stream(); } + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/utilities/identify_stream_usage.cpp b/cpp/tests/utilities/identify_stream_usage.cpp index 87301a7d49d..a4d19a8f552 100644 --- a/cpp/tests/utilities/identify_stream_usage.cpp +++ b/cpp/tests/utilities/identify_stream_usage.cpp @@ -19,18 +19,58 @@ #include +#include #include #include #include #include #include +#include #include -/** - * @brief Print a backtrace and raise an error if stream is a default stream. - */ -void check_stream_and_error(cudaStream_t stream) +// We control whether to override cudf::test::get_default_stream or +// cudf::get_default_stream with a compile-time flag. These are the two valid +// options: +// 1. STREAM_MODE_TESTING=OFF: In this mode, cudf::get_default_stream will +// return a custom stream and stream_is_invalid will return true if any CUDA +// API is called using any of CUDA's default stream constants +// (cudaStreamLegacy, cudaStreamDefault, or cudaStreamPerThread). This check +// is sufficient to ensure that cudf is using cudf::get_default_stream +// everywhere internally rather than implicitly using stream 0, +// cudaStreamDefault, cudaStreamLegacy, thrust execution policies, etc. It +// is not sufficient to guarantee a stream-ordered API because it will not +// identify places in the code that use cudf::get_default_stream instead of +// properly forwarding along a user-provided stream. +// 2. STREAM_MODE_TESTING=ON: In this mode, cudf::test::get_default_stream +// returns a custom stream and stream_is_invalid returns true if any CUDA +// API is called using any stream other than cudf::test::get_default_stream.
+// This is a necessary and sufficient condition to ensure that libcudf is +// properly passing streams through all of its (tested) APIs. + +namespace cudf { + +#ifdef STREAM_MODE_TESTING +namespace test { +#endif + +rmm::cuda_stream_view const get_default_stream() { + static rmm::cuda_stream stream{}; + return {stream}; +} + +#ifdef STREAM_MODE_TESTING +} // namespace test +#endif + +} // namespace cudf + +bool stream_is_invalid(cudaStream_t stream) +{ +#ifdef STREAM_MODE_TESTING + // In this mode the _only_ valid stream is the one returned by cudf::test::get_default_stream. + return (stream != cudf::test::get_default_stream().value()); +#else // We explicitly list the possibilities rather than using // `cudf::get_default_stream().value()` for two reasons: // 1. There is no guarantee that `thrust::device` and the default value of @@ -39,8 +79,17 @@ void check_stream_and_error(cudaStream_t stream) // 2. Using the cudf default stream would require linking against cudf, which // adds unnecessary complexity to the build process (especially in CI) // when this simple approach is sufficient. - if (stream == cudaStreamDefault || (stream == cudaStreamLegacy) || - (stream == cudaStreamPerThread)) { + return (stream == cudaStreamDefault) || (stream == cudaStreamLegacy) || + (stream == cudaStreamPerThread); +#endif +} + +/** + * @brief Print a backtrace and raise an error if stream is a default stream. + */ +void check_stream_and_error(cudaStream_t stream) +{ + if (stream_is_invalid(stream)) { #ifdef __GNUC__ // If we're on the wrong stream, print the stack trace from the current frame. // Adapted from from https://panthema.net/2008/0901-stacktrace-demangled/ @@ -109,7 +158,12 @@ void check_stream_and_error(cudaStream_t stream) #else std::cout << "Backtraces are only when built with a GNU compiler." 
<< std::endl; #endif // __GNUC__ - throw std::runtime_error("Found unexpected default stream!"); + char const* env_stream_error_mode{std::getenv("GTEST_CUDF_STREAM_ERROR_MODE")}; + if (env_stream_error_mode && !strcmp(env_stream_error_mode, "print")) { + std::cout << "Found unexpected stream!" << std::endl; + } else { + throw std::runtime_error("Found unexpected stream!"); + } } } @@ -289,23 +343,6 @@ DEFINE_OVERLOAD(cudaMallocFromPoolAsync, ARG(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream), ARG(ptr, size, memPool, stream)); -namespace cudf { - -/** - * @brief Get the current default stream - * - * Overload the default function to return a new stream here. - * - * @return The current default stream. - */ -rmm::cuda_stream_view const get_default_stream() -{ - static rmm::cuda_stream stream{}; - return {stream}; -} - -} // namespace cudf - /** * @brief Function to collect all the original CUDA symbols corresponding to overloaded functions. * From fbbf1fabf18fa70628b85f11ce76a868a0d47a00 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 14 Mar 2023 17:51:36 -0500 Subject: [PATCH 04/63] Add README symlink for dask-cudf. (#12946) dask-cudf's `pyproject.toml` references a README.md that does not exist. This fixes it by adding a symlink to the top-level repository README.md. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12946 --- python/dask_cudf/README.md | 1 + 1 file changed, 1 insertion(+) create mode 120000 python/dask_cudf/README.md diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md new file mode 120000 index 00000000000..fe840054137 --- /dev/null +++ b/python/dask_cudf/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file From 745d35e9fe4225572ed8f086fdd58032ec1ffaeb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 14 Mar 2023 16:28:02 -0700 Subject: [PATCH 05/63] Refactor `io::orc::ProtobufWriter` (#12877) This refactors the class `cudf::io::orc::ProtobufWriter`, making it independent from the ORC writer buffer. From now, a new instance of `ProtobufWriter` will work on its own buffer. That avoids touching the ORC writer's internal states. The PR is part of solution for https://github.com/rapidsai/cudf/issues/12792. 
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/12877 --- cpp/src/io/orc/orc.cpp | 40 ++++++------ cpp/src/io/orc/orc.hpp | 87 ++++++++++++++----------- cpp/src/io/orc/orc_field_writer.hpp | 14 ++--- cpp/src/io/orc/writer_impl.cu | 98 ++++++++++++++--------------- cpp/src/io/orc/writer_impl.hpp | 7 +-- 5 files changed, 125 insertions(+), 121 deletions(-) diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index 880990c552f..5445e59297c 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -28,7 +28,7 @@ namespace cudf { namespace io { namespace orc { -uint32_t ProtobufReader::read_field_size(const uint8_t* end) +uint32_t ProtobufReader::read_field_size(uint8_t const* end) { auto const size = get(); CUDF_EXPECTS(size <= static_cast(end - m_cur), "Protobuf parsing out of bounds"); @@ -213,8 +213,7 @@ void ProtobufWriter::put_row_index_entry(int32_t present_blk, TypeKind kind, ColStatsBlob const* stats) { - std::vector positions_data; - ProtobufWriter position_writer(&positions_data); + ProtobufWriter position_writer; auto const positions_size_offset = position_writer.put_uint( encode_field_number(1, ProtofType::FIXEDLEN)); // 1:positions[packed=true] position_writer.put_byte(0xcd); // positions size placeholder @@ -246,19 +245,20 @@ void ProtobufWriter::put_row_index_entry(int32_t present_blk, positions_size += position_writer.put_byte(0); } } + // size of the field 1 - positions_data[positions_size_offset] = static_cast(positions_size); + position_writer.buffer()[positions_size_offset] = static_cast(positions_size); auto const stats_size = (stats == nullptr) ? 
0 : varint_size(encode_field_number(2)) + varint_size(stats->size()) + stats->size(); - auto const entry_size = positions_data.size() + stats_size; + auto const entry_size = position_writer.size() + stats_size; // 1:RowIndex.entry put_uint(encode_field_number(1, ProtofType::FIXEDLEN)); put_uint(entry_size); - put_bytes(positions_data); + put_bytes(position_writer.buffer()); if (stats != nullptr) { put_uint(encode_field_number(2)); // 2: statistics @@ -268,7 +268,7 @@ void ProtobufWriter::put_row_index_entry(int32_t present_blk, } } -size_t ProtobufWriter::write(const PostScript& s) +size_t ProtobufWriter::write(PostScript const& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.footerLength); @@ -280,7 +280,7 @@ size_t ProtobufWriter::write(const PostScript& s) return w.value(); } -size_t ProtobufWriter::write(const FileFooter& s) +size_t ProtobufWriter::write(FileFooter const& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.headerLength); @@ -294,7 +294,7 @@ size_t ProtobufWriter::write(const FileFooter& s) return w.value(); } -size_t ProtobufWriter::write(const StripeInformation& s) +size_t ProtobufWriter::write(StripeInformation const& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.offset); @@ -305,7 +305,7 @@ size_t ProtobufWriter::write(const StripeInformation& s) return w.value(); } -size_t ProtobufWriter::write(const SchemaType& s) +size_t ProtobufWriter::write(SchemaType const& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.kind); @@ -317,7 +317,7 @@ size_t ProtobufWriter::write(const SchemaType& s) return w.value(); } -size_t ProtobufWriter::write(const UserMetadataItem& s) +size_t ProtobufWriter::write(UserMetadataItem const& s) { ProtobufFieldWriter w(this); w.field_blob(1, s.name); @@ -325,7 +325,7 @@ size_t ProtobufWriter::write(const UserMetadataItem& s) return w.value(); } -size_t ProtobufWriter::write(const StripeFooter& s) +size_t ProtobufWriter::write(StripeFooter const& s) { ProtobufFieldWriter w(this); w.field_repeated_struct(1, 
s.streams); @@ -334,7 +334,7 @@ size_t ProtobufWriter::write(const StripeFooter& s) return w.value(); } -size_t ProtobufWriter::write(const Stream& s) +size_t ProtobufWriter::write(Stream const& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.kind); @@ -343,7 +343,7 @@ size_t ProtobufWriter::write(const Stream& s) return w.value(); } -size_t ProtobufWriter::write(const ColumnEncoding& s) +size_t ProtobufWriter::write(ColumnEncoding const& s) { ProtobufFieldWriter w(this); w.field_uint(1, s.kind); @@ -351,14 +351,14 @@ size_t ProtobufWriter::write(const ColumnEncoding& s) return w.value(); } -size_t ProtobufWriter::write(const StripeStatistics& s) +size_t ProtobufWriter::write(StripeStatistics const& s) { ProtobufFieldWriter w(this); w.field_repeated_struct_blob(1, s.colStats); return w.value(); } -size_t ProtobufWriter::write(const Metadata& s) +size_t ProtobufWriter::write(Metadata const& s) { ProtobufFieldWriter w(this); w.field_repeated_struct(1, s.stripeStats); @@ -443,13 +443,13 @@ host_span OrcDecompressor::decompress_blocks(host_spansize(); - const auto max_ps_size = std::min(len, static_cast(256)); + auto const len = source->size(); + auto const max_ps_size = std::min(len, static_cast(256)); // Read uncompressed postscript section (max 255 bytes + 1 byte for length) auto buffer = source->host_read(len - max_ps_size, max_ps_size); - const size_t ps_length = buffer->data()[max_ps_size - 1]; - const uint8_t* ps_data = &buffer->data()[max_ps_size - ps_length - 1]; + size_t const ps_length = buffer->data()[max_ps_size - 1]; + uint8_t const* ps_data = &buffer->data()[max_ps_size - ps_length - 1]; ProtobufReader(ps_data, ps_length).read(ps); CUDF_EXPECTS(ps.footerLength + ps_length < len, "Invalid footer length"); diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 44882b71925..d30c3823080 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -196,7 +196,7 @@ int constexpr encode_field_number(int field_number) noexcept */ class ProtobufReader { public: - ProtobufReader(const uint8_t* base, size_t len) : m_base(base), m_cur(base), m_end(base + len) {} + ProtobufReader(uint8_t const* base, size_t len) : m_base(base), m_cur(base), m_end(base + len) {} template void read(T& s) @@ -241,40 +241,40 @@ class ProtobufReader { template void function_builder(T& s, size_t maxlen, std::tuple& op); - uint32_t read_field_size(const uint8_t* end); + uint32_t read_field_size(uint8_t const* end); template >* = nullptr> - void read_field(T& value, const uint8_t* end) + void read_field(T& value, uint8_t const* end) { value = get(); } template >* = nullptr> - void read_field(T& value, const uint8_t* end) + void read_field(T& value, uint8_t const* end) { value = static_cast(get()); } template >* = nullptr> - void read_field(T& value, const uint8_t* end) + void read_field(T& value, uint8_t const* end) { auto const size = read_field_size(end); - value.assign(reinterpret_cast(m_cur), size); + value.assign(reinterpret_cast(m_cur), size); m_cur += size; } template >>* = nullptr> - void read_field(T& value, const uint8_t* end) + void read_field(T& value, uint8_t const* end) { auto const size = read_field_size(end); - value.emplace_back(reinterpret_cast(m_cur), size); + value.emplace_back(reinterpret_cast(m_cur), size); m_cur += size; } template > and !std::is_same_v>* = nullptr> - void read_field(T& value, const uint8_t* end) + void read_field(T& value, uint8_t const* end) { auto const size = read_field_size(end); value.emplace_back(); @@ -283,7 +283,7 @@ class ProtobufReader { template >>* = nullptr> - void read_field(T& value, const uint8_t* end) + void read_field(T& value, uint8_t const* end) { typename T::value_type contained_value; 
read_field(contained_value, end); @@ -291,21 +291,21 @@ class ProtobufReader { } template - auto read_field(T& value, const uint8_t* end) -> decltype(read(value, 0)) + auto read_field(T& value, uint8_t const* end) -> decltype(read(value, 0)) { auto const size = read_field_size(end); read(value, size); } template >* = nullptr> - void read_field(T& value, const uint8_t* end) + void read_field(T& value, uint8_t const* end) { memcpy(&value, m_cur, sizeof(T)); m_cur += sizeof(T); } template - void read_packed_field(T& value, const uint8_t* end) + void read_packed_field(T& value, uint8_t const* end) { auto const len = get(); auto const field_end = std::min(m_cur + len, end); @@ -314,7 +314,7 @@ class ProtobufReader { } template - void read_raw_field(T& value, const uint8_t* end) + void read_raw_field(T& value, uint8_t const* end) { auto const size = read_field_size(end); value.emplace_back(m_cur, m_cur + size); @@ -331,7 +331,7 @@ class ProtobufReader { { } - inline void operator()(ProtobufReader* pbr, const uint8_t* end) + inline void operator()(ProtobufReader* pbr, uint8_t const* end) { pbr->read_field(output_value, end); } @@ -347,7 +347,7 @@ class ProtobufReader { { } - inline void operator()(ProtobufReader* pbr, const uint8_t* end) + inline void operator()(ProtobufReader* pbr, uint8_t const* end) { pbr->read_packed_field(output_value, end); } @@ -363,15 +363,15 @@ class ProtobufReader { { } - inline void operator()(ProtobufReader* pbr, const uint8_t* end) + inline void operator()(ProtobufReader* pbr, uint8_t const* end) { pbr->read_raw_field(output_value, end); } }; - const uint8_t* const m_base; - const uint8_t* m_cur; - const uint8_t* const m_end; + uint8_t const* const m_base; + uint8_t const* m_cur; + uint8_t const* const m_end; public: /** @@ -477,21 +477,25 @@ inline int64_t ProtobufReader::get() */ class ProtobufWriter { public: - ProtobufWriter() { m_buf = nullptr; } - ProtobufWriter(std::vector* output) { m_buf = output; } + ProtobufWriter() = default; + + 
ProtobufWriter(std::size_t bytes) : m_buff(bytes) {} + uint32_t put_byte(uint8_t v) { - m_buf->push_back(v); + m_buff.push_back(v); return 1; } + template uint32_t put_bytes(host_span values) { static_assert(sizeof(T) == 1); - m_buf->reserve(m_buf->size() + values.size()); - m_buf->insert(m_buf->end(), values.begin(), values.end()); + m_buff.reserve(m_buff.size() + values.size()); + m_buff.insert(m_buff.end(), values.begin(), values.end()); return values.size(); } + uint32_t put_uint(uint64_t v) { int l = 1; @@ -519,6 +523,7 @@ class ProtobufWriter { int64_t s = (v < 0); return put_uint(((v ^ -s) << 1) + s); } + void put_row_index_entry(int32_t present_blk, int32_t present_ofs, int32_t data_blk, @@ -528,20 +533,26 @@ class ProtobufWriter { TypeKind kind, ColStatsBlob const* stats); + std::size_t size() const { return m_buff.size(); } + uint8_t const* data() { return m_buff.data(); } + + std::vector& buffer() { return m_buff; } + std::vector release() { return std::move(m_buff); } + public: - size_t write(const PostScript&); - size_t write(const FileFooter&); - size_t write(const StripeInformation&); - size_t write(const SchemaType&); - size_t write(const UserMetadataItem&); - size_t write(const StripeFooter&); - size_t write(const Stream&); - size_t write(const ColumnEncoding&); - size_t write(const StripeStatistics&); - size_t write(const Metadata&); + size_t write(PostScript const&); + size_t write(FileFooter const&); + size_t write(StripeInformation const&); + size_t write(SchemaType const&); + size_t write(UserMetadataItem const&); + size_t write(StripeFooter const&); + size_t write(Stream const&); + size_t write(ColumnEncoding const&); + size_t write(StripeStatistics const&); + size_t write(Metadata const&); protected: - std::vector* m_buf; + std::vector m_buff; struct ProtobufFieldWriter; }; @@ -613,7 +624,7 @@ struct column_validity_info { * convenience methods for initializing and accessing metadata. 
*/ class metadata { - using OrcStripeInfo = std::pair; + using OrcStripeInfo = std::pair; public: struct stripe_source_mapping { diff --git a/cpp/src/io/orc/orc_field_writer.hpp b/cpp/src/io/orc/orc_field_writer.hpp index 44d87190844..fdba0d81a32 100644 --- a/cpp/src/io/orc/orc_field_writer.hpp +++ b/cpp/src/io/orc/orc_field_writer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ struct ProtobufWriter::ProtobufFieldWriter { void field_packed_uint(int field, const std::vector& value) { struct_size += p->put_uint(encode_field_number>(field)); - auto lpos = p->m_buf->size(); + auto lpos = p->m_buff.size(); p->put_byte(0); auto sz = std::accumulate(value.begin(), value.end(), 0, [p = this->p](size_t sum, auto val) { return sum + p->put_uint(val); @@ -62,8 +62,8 @@ struct ProtobufWriter::ProtobufFieldWriter { struct_size += sz + 1; for (; sz > 0x7f; sz >>= 7, struct_size++) - p->m_buf->insert(p->m_buf->begin() + (lpos++), static_cast((sz & 0x7f) | 0x80)); - (*(p->m_buf))[lpos] = static_cast(sz); + p->m_buff.insert(p->m_buff.begin() + (lpos++), static_cast((sz & 0x7f) | 0x80)); + (p->m_buff)[lpos] = static_cast(sz); } /** @@ -84,13 +84,13 @@ struct ProtobufWriter::ProtobufFieldWriter { void field_struct(int field, const T& value) { struct_size += p->put_uint(encode_field_number(field, ProtofType::FIXEDLEN)); - auto lpos = p->m_buf->size(); + auto lpos = p->m_buff.size(); p->put_byte(0); auto sz = p->write(value); struct_size += sz + 1; for (; sz > 0x7f; sz >>= 7, struct_size++) - p->m_buf->insert(p->m_buf->begin() + (lpos++), static_cast((sz & 0x7f) | 0x80)); - (*(p->m_buf))[lpos] = static_cast(sz); + p->m_buff.insert(p->m_buff.begin() + (lpos++), static_cast((sz & 0x7f) | 0x80)); + (p->m_buff)[lpos] = static_cast(sz); } /** diff --git 
a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 8d85b001817..00b5c5428b1 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1390,8 +1390,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, host_span comp_res, std::vector const& rg_stats, StripeInformation* stripe, - orc_streams* streams, - ProtobufWriter* pbw) + orc_streams* streams) { row_group_index_info present; row_group_index_info data; @@ -1443,21 +1442,21 @@ void writer::impl::write_index_stream(int32_t stripe_id, } } - buffer_.resize((compression_kind_ != NONE) ? 3 : 0); + ProtobufWriter pbw((compression_kind_ != NONE) ? 3 : 0); // Add row index entries auto const& rowgroups_range = segmentation.stripes[stripe_id]; std::for_each(rowgroups_range.cbegin(), rowgroups_range.cend(), [&](auto rowgroup) { - pbw->put_row_index_entry(present.comp_pos, - present.pos, - data.comp_pos, - data.pos, - data2.comp_pos, - data2.pos, - kind, - (rg_stats.empty() or stream_id == 0) - ? nullptr - : (&rg_stats[column_id * segmentation.num_rowgroups() + rowgroup])); + pbw.put_row_index_entry(present.comp_pos, + present.pos, + data.comp_pos, + data.pos, + data2.comp_pos, + data2.pos, + kind, + (rg_stats.empty() or stream_id == 0) + ? 
nullptr + : (&rg_stats[column_id * segmentation.num_rowgroups() + rowgroup])); if (stream_id != 0) { const auto& strm = enc_streams[column_id][rowgroup]; @@ -1467,15 +1466,15 @@ void writer::impl::write_index_stream(int32_t stripe_id, } }); - (*streams)[stream_id].length = buffer_.size(); + (*streams)[stream_id].length = pbw.size(); if (compression_kind_ != NONE) { uint32_t uncomp_ix_len = (uint32_t)((*streams)[stream_id].length - 3) * 2 + 1; - buffer_[0] = static_cast(uncomp_ix_len >> 0); - buffer_[1] = static_cast(uncomp_ix_len >> 8); - buffer_[2] = static_cast(uncomp_ix_len >> 16); + pbw.buffer()[0] = static_cast(uncomp_ix_len >> 0); + pbw.buffer()[1] = static_cast(uncomp_ix_len >> 8); + pbw.buffer()[2] = static_cast(uncomp_ix_len >> 16); } - out_sink_->host_write(buffer_.data(), buffer_.size()); - stripe->indexLength += buffer_.size(); + out_sink_->host_write(pbw.data(), pbw.size()); + stripe->indexLength += pbw.size(); } std::future writer::impl::write_data_stream(gpu::StripeStream const& strm_desc, @@ -2254,8 +2253,6 @@ void writer::impl::write(table_view const& table) comp_results.device_to_host(stream, true); } - ProtobufWriter pbw_(&buffer_); - auto intermediate_stats = gather_statistic_blobs(stats_freq_, orc_table, segmentation); if (intermediate_stats.stripe_stat_chunks.size() > 0) { @@ -2281,8 +2278,7 @@ void writer::impl::write(table_view const& table) comp_results, intermediate_stats.rowgroup_blobs, &stripe, - &streams, - &pbw_); + &streams); } // Column data consisting one or more separate streams @@ -2309,16 +2305,16 @@ void writer::impl::write(table_view const& table) : 0; if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } } - buffer_.resize((compression_kind_ != NONE) ? 3 : 0); - pbw_.write(sf); - stripe.footerLength = buffer_.size(); + ProtobufWriter pbw((compression_kind_ != NONE) ? 
3 : 0); + pbw.write(sf); + stripe.footerLength = pbw.size(); if (compression_kind_ != NONE) { uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; - buffer_[0] = static_cast(uncomp_sf_len >> 0); - buffer_[1] = static_cast(uncomp_sf_len >> 8); - buffer_[2] = static_cast(uncomp_sf_len >> 16); + pbw.buffer()[0] = static_cast(uncomp_sf_len >> 0); + pbw.buffer()[1] = static_cast(uncomp_sf_len >> 8); + pbw.buffer()[2] = static_cast(uncomp_sf_len >> 16); } - out_sink_->host_write(buffer_.data(), buffer_.size()); + out_sink_->host_write(pbw.data(), pbw.size()); } for (auto const& task : write_tasks) { task.wait(); @@ -2376,19 +2372,18 @@ void writer::impl::close() { if (closed) { return; } closed = true; - ProtobufWriter pbw_(&buffer_); PostScript ps; auto const statistics = finish_statistic_blobs(ff.stripes.size(), persisted_stripe_statistics); // File-level statistics if (not statistics.file_level.empty()) { - buffer_.resize(0); - pbw_.put_uint(encode_field_number(1)); - pbw_.put_uint(persisted_stripe_statistics.num_rows); + ProtobufWriter pbw; + pbw.put_uint(encode_field_number(1)); + pbw.put_uint(persisted_stripe_statistics.num_rows); // First entry contains total number of rows ff.statistics.reserve(ff.types.size()); - ff.statistics.emplace_back(std::move(buffer_)); + ff.statistics.emplace_back(pbw.release()); // Add file stats, stored after stripe stats in `column_stats` ff.statistics.insert(ff.statistics.end(), std::make_move_iterator(statistics.file_level.begin()), @@ -2400,10 +2395,10 @@ void writer::impl::close() md.stripeStats.resize(ff.stripes.size()); for (size_t stripe_id = 0; stripe_id < ff.stripes.size(); stripe_id++) { md.stripeStats[stripe_id].colStats.resize(ff.types.size()); - buffer_.resize(0); - pbw_.put_uint(encode_field_number(1)); - pbw_.put_uint(ff.stripes[stripe_id].numberOfRows); - md.stripeStats[stripe_id].colStats[0] = std::move(buffer_); + ProtobufWriter pbw; + pbw.put_uint(encode_field_number(1)); + 
pbw.put_uint(ff.stripes[stripe_id].numberOfRows); + md.stripeStats[stripe_id].colStats[0] = pbw.release(); for (size_t col_idx = 0; col_idx < ff.types.size() - 1; col_idx++) { size_t idx = ff.stripes.size() * col_idx + stripe_id; md.stripeStats[stripe_id].colStats[1 + col_idx] = std::move(statistics.stripe_level[idx]); @@ -2421,27 +2416,28 @@ void writer::impl::close() // Write statistics metadata if (md.stripeStats.size() != 0) { - buffer_.resize((compression_kind_ != NONE) ? 3 : 0); - pbw_.write(md); - add_uncompressed_block_headers(buffer_); - ps.metadataLength = buffer_.size(); - out_sink_->host_write(buffer_.data(), buffer_.size()); + ProtobufWriter pbw((compression_kind_ != NONE) ? 3 : 0); + pbw.write(md); + add_uncompressed_block_headers(pbw.buffer()); + ps.metadataLength = pbw.size(); + out_sink_->host_write(pbw.data(), pbw.size()); } else { ps.metadataLength = 0; } - buffer_.resize((compression_kind_ != NONE) ? 3 : 0); - pbw_.write(ff); - add_uncompressed_block_headers(buffer_); + ProtobufWriter pbw((compression_kind_ != NONE) ? 3 : 0); + pbw.write(ff); + add_uncompressed_block_headers(pbw.buffer()); // Write postscript metadata - ps.footerLength = buffer_.size(); + ps.footerLength = pbw.size(); ps.compression = compression_kind_; ps.compressionBlockSize = compression_blocksize_; ps.version = {0, 12}; ps.magic = MAGIC; - const auto ps_length = static_cast(pbw_.write(ps)); - buffer_.push_back(ps_length); - out_sink_->host_write(buffer_.data(), buffer_.size()); + + const auto ps_length = static_cast(pbw.write(ps)); + pbw.put_byte(ps_length); + out_sink_->host_write(pbw.data(), pbw.size()); out_sink_->flush(); } diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index dc8aad33af0..691fba6bac2 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -382,7 +382,6 @@ class writer::impl { * @param[in] rg_stats row group level statistics * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams - * @param[in,out] pbw Protobuf writer */ void write_index_stream(int32_t stripe_id, int32_t stream_id, @@ -393,8 +392,7 @@ class writer::impl { host_span comp_out, std::vector const& rg_stats, StripeInformation* stripe, - orc_streams* streams, - ProtobufWriter* pbw); + orc_streams* streams); /** * @brief Write the specified column's data streams @@ -451,7 +449,6 @@ class writer::impl { // statistics data saved between calls to write before a close writes out the statistics persisted_statistics persisted_stripe_statistics; - std::vector buffer_; std::unique_ptr out_sink_; }; From 6c8bf45a5eb03e573dd3c4029b27ff4c4c877797 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 14 Mar 2023 23:29:41 -0500 Subject: [PATCH 06/63] Avoid building cython twice (#12945) This PR moves the `build_ext` step to only happen when `INSTALL_TARGET==''`, thus avoiding the cython build to occur twice when `build.sh` is invoked. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/12945 --- build.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index bee66d819b4..0de87826ba7 100755 --- a/build.sh +++ b/build.sh @@ -331,9 +331,10 @@ fi if buildAll || hasArg cudf; then cd ${REPODIR}/python/cudf - python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} -DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} if [[ ${INSTALL_TARGET} != "" ]]; then - python setup.py install --single-version-externally-managed --record=record.txt -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} + python setup.py install --single-version-externally-managed --record=record.txt -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} + else + python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} fi fi From dfa9e934e27826bd07ee334b60d01b42afc0ce78 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Mar 2023 09:56:36 -0400 Subject: [PATCH 07/63] Remove remaining default stream parameters (#12943) This PR closes #9854, removing all default stream parameters in detail APIs. This increases stream-safety by removing the ability to accidentally use the default stream when a detail API is called without an explicit stream parameter when a user-provided stream should have been passed through. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/12943 --- .../binaryop/compiled/struct_binary_ops.cuh | 8 +-- cpp/src/join/conditional_join.hpp | 21 ++++---- cpp/src/merge/merge.cu | 8 +-- cpp/src/partitioning/round_robin.cu | 8 +-- .../rolling/detail/range_window_bounds.hpp | 9 ++-- .../rolling/range_window_bounds_test.cpp | 54 +++++++++++++------ 6 files changed, 64 insertions(+), 44 deletions(-) diff --git a/cpp/src/binaryop/compiled/struct_binary_ops.cuh b/cpp/src/binaryop/compiled/struct_binary_ops.cuh index 8418493318f..2299df5a9bb 100644 --- a/cpp/src/binaryop/compiled/struct_binary_ops.cuh +++ b/cpp/src/binaryop/compiled/struct_binary_ops.cuh @@ -70,8 +70,8 @@ void apply_struct_binary_op(mutable_column_view& out, column_view const& rhs, bool is_lhs_scalar, bool is_rhs_scalar, - PhysicalElementComparator comparator = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream()) + PhysicalElementComparator comparator, + rmm::cuda_stream_view stream) { auto const compare_orders = std::vector( lhs.size(), @@ -144,8 +144,8 @@ void apply_struct_equality_op(mutable_column_view& out, bool is_lhs_scalar, bool is_rhs_scalar, binary_operator op, - PhysicalEqualityComparator comparator = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream()) + PhysicalEqualityComparator comparator, + rmm::cuda_stream_view stream) { CUDF_EXPECTS(op == binary_operator::EQUAL || op == binary_operator::NOT_EQUAL || op == binary_operator::NULL_EQUALS, diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp index 7c329cd8e17..9bc6024ee7e 100644 --- a/cpp/src/join/conditional_join.hpp +++ b/cpp/src/join/conditional_join.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,9 +47,9 @@ conditional_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, join_kind JoinKind, - std::optional output_size = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes the size of a join operation between two tables without @@ -63,13 +63,12 @@ conditional_join(table_view const& left, * * @return Join output indices vector pair */ -std::size_t compute_conditional_join_output_size( - table_view const& left, - table_view const& right, - ast::expression const& binary_predicate, - join_kind JoinKind, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::size_t compute_conditional_join_output_size(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + join_kind JoinKind, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index d9c573e8155..05842348807 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -170,8 +170,8 @@ index_vector generate_merged_indices(table_view const& left_table, table_view const& right_table, std::vector const& column_order, std::vector const& null_precedence, - bool nullable = true, - rmm::cuda_stream_view stream = cudf::get_default_stream()) + bool nullable, + rmm::cuda_stream_view stream) { const size_type left_size = left_table.num_rows(); const size_type right_size = right_table.num_rows(); @@ -410,7 +410,7 @@ table_ptr_type merge(cudf::table_view const& left_table, // extract merged row order according to indices: // auto const merged_indices = generate_merged_indices( - index_left_view, index_right_view, column_order, null_precedence, nullable); + index_left_view, index_right_view, column_order, null_precedence, nullable, stream); // create merged table: // diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index 990992cd8f2..00f64b36e2d 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -151,9 +151,9 @@ namespace detail { std::pair, std::vector> round_robin_partition( table_view const& input, cudf::size_type num_partitions, - cudf::size_type start_partition = 0, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + cudf::size_type start_partition, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto nrows = input.num_rows(); diff --git a/cpp/src/rolling/detail/range_window_bounds.hpp b/cpp/src/rolling/detail/range_window_bounds.hpp index 506bd54e5eb..d1de7adba7a 100644 --- a/cpp/src/rolling/detail/range_window_bounds.hpp +++ b/cpp/src/rolling/detail/range_window_bounds.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -146,10 +146,9 @@ RepT range_comparable_value_impl(scalar const& range_scalar, * @return RepType Value of the range scalar */ template -range_rep_type range_comparable_value( - range_window_bounds const& range_bounds, - data_type const& order_by_data_type = data_type{type_to_id()}, - rmm::cuda_stream_view stream = cudf::get_default_stream()) +range_rep_type range_comparable_value(range_window_bounds const& range_bounds, + data_type const& order_by_data_type, + rmm::cuda_stream_view stream) { auto const& range_scalar = range_bounds.range_scalar(); using range_type = cudf::detail::range_type; diff --git a/cpp/tests/rolling/range_window_bounds_test.cpp b/cpp/tests/rolling/range_window_bounds_test.cpp index 1b753fb6040..c70e0a78100 100644 --- a/cpp/tests/rolling/range_window_bounds_test.cpp +++ b/cpp/tests/rolling/range_window_bounds_test.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -57,34 +58,43 @@ TYPED_TEST(TimestampRangeWindowBoundsTest, BoundsConstruction) using OrderByType = TypeParam; 
using range_type = cudf::detail::range_type; using rep_type = cudf::detail::range_rep_type; + auto const dtype = cudf::data_type{cudf::type_to_id()}; static_assert(cudf::is_duration()); auto range_3 = cudf::range_window_bounds::get(cudf::duration_scalar{3, true}); EXPECT_FALSE(range_3.is_unbounded() && "range_window_bounds constructed from scalar cannot be unbounded."); - EXPECT_EQ(cudf::detail::range_comparable_value(range_3), rep_type{3}); + EXPECT_EQ( + cudf::detail::range_comparable_value(range_3, dtype, cudf::get_default_stream()), + rep_type{3}); auto range_unbounded = cudf::range_window_bounds::unbounded(cudf::data_type{cudf::type_to_id()}); EXPECT_TRUE(range_unbounded.is_unbounded() && "range_window_bounds::unbounded() must return an unbounded range."); - EXPECT_EQ(cudf::detail::range_comparable_value(range_unbounded), rep_type{}); + EXPECT_EQ(cudf::detail::range_comparable_value( + range_unbounded, dtype, cudf::get_default_stream()), + rep_type{}); } TYPED_TEST(TimestampRangeWindowBoundsTest, WrongRangeType) { using OrderByType = TypeParam; + auto const dtype = cudf::data_type{cudf::type_to_id()}; using wrong_range_type = std::conditional_t, cudf::duration_ns, cudf::duration_D>; auto range_3 = cudf::range_window_bounds::get(cudf::duration_scalar{3, true}); - EXPECT_THROW(cudf::detail::range_comparable_value(range_3), cudf::logic_error); + EXPECT_THROW( + cudf::detail::range_comparable_value(range_3, dtype, cudf::get_default_stream()), + cudf::logic_error); auto range_unbounded = cudf::range_window_bounds::unbounded(cudf::data_type{cudf::type_to_id()}); - EXPECT_THROW(cudf::detail::range_comparable_value(range_unbounded), + EXPECT_THROW(cudf::detail::range_comparable_value( + range_unbounded, dtype, cudf::get_default_stream()), cudf::logic_error); } @@ -112,33 +122,42 @@ TYPED_TEST(NumericRangeWindowBoundsTest, BoundsConstruction) using OrderByType = TypeParam; using range_type = cudf::detail::range_type; using rep_type = cudf::detail::range_rep_type; + 
auto const dtype = cudf::data_type{cudf::type_to_id()}; static_assert(std::is_integral_v); auto range_3 = cudf::range_window_bounds::get(cudf::numeric_scalar{3, true}); EXPECT_FALSE(range_3.is_unbounded() && "range_window_bounds constructed from scalar cannot be unbounded."); - EXPECT_EQ(cudf::detail::range_comparable_value(range_3), rep_type{3}); + EXPECT_EQ( + cudf::detail::range_comparable_value(range_3, dtype, cudf::get_default_stream()), + rep_type{3}); auto range_unbounded = cudf::range_window_bounds::unbounded(cudf::data_type{cudf::type_to_id()}); EXPECT_TRUE(range_unbounded.is_unbounded() && "range_window_bounds::unbounded() must return an unbounded range."); - EXPECT_EQ(cudf::detail::range_comparable_value(range_unbounded), rep_type{}); + EXPECT_EQ(cudf::detail::range_comparable_value( + range_unbounded, dtype, cudf::get_default_stream()), + rep_type{}); } TYPED_TEST(NumericRangeWindowBoundsTest, WrongRangeType) { using OrderByType = TypeParam; + auto const dtype = cudf::data_type{cudf::type_to_id()}; using wrong_range_type = std::conditional_t, int16_t, int32_t>; auto range_3 = cudf::range_window_bounds::get(cudf::numeric_scalar{3, true}); - EXPECT_THROW(cudf::detail::range_comparable_value(range_3), cudf::logic_error); + EXPECT_THROW( + cudf::detail::range_comparable_value(range_3, dtype, cudf::get_default_stream()), + cudf::logic_error); auto range_unbounded = cudf::range_window_bounds::unbounded(cudf::data_type{cudf::type_to_id()}); - EXPECT_THROW(cudf::detail::range_comparable_value(range_unbounded), + EXPECT_THROW(cudf::detail::range_comparable_value( + range_unbounded, dtype, cudf::get_default_stream()), cudf::logic_error); } @@ -150,8 +169,9 @@ TYPED_TEST_SUITE(DecimalRangeBoundsTest, cudf::test::FixedPointTypes); TYPED_TEST(DecimalRangeBoundsTest, BoundsConstruction) { - using DecimalT = TypeParam; - using Rep = cudf::detail::range_rep_type; + using DecimalT = TypeParam; + using Rep = cudf::detail::range_rep_type; + auto const dtype = 
cudf::data_type{cudf::type_to_id()}; // Interval type must match the decimal type. static_assert(std::is_same_v, DecimalT>); @@ -160,7 +180,9 @@ TYPED_TEST(DecimalRangeBoundsTest, BoundsConstruction) cudf::fixed_point_scalar{Rep{3}, numeric::scale_type{0}}); EXPECT_FALSE(range_3.is_unbounded() && "range_window_bounds constructed from scalar cannot be unbounded."); - EXPECT_EQ(cudf::detail::range_comparable_value(range_3), Rep{3}); + EXPECT_EQ( + cudf::detail::range_comparable_value(range_3, dtype, cudf::get_default_stream()), + Rep{3}); auto const range_unbounded = cudf::range_window_bounds::unbounded(cudf::data_type{cudf::type_to_id()}); @@ -183,8 +205,8 @@ TYPED_TEST(DecimalRangeBoundsTest, Rescale) for (auto const range_scale : {-2, -1, 0, 1, 2}) { auto const decimal_range_bounds = cudf::range_window_bounds::get( cudf::fixed_point_scalar{RepT{20}, numeric::scale_type{range_scale}}); - auto const rescaled_range_rep = - cudf::detail::range_comparable_value(decimal_range_bounds, order_by_data_type); + auto const rescaled_range_rep = cudf::detail::range_comparable_value( + decimal_range_bounds, order_by_data_type, cudf::get_default_stream()); EXPECT_EQ(rescaled_range_rep, RepT{20} * pow10[range_scale - order_by_scale]); } @@ -192,8 +214,8 @@ TYPED_TEST(DecimalRangeBoundsTest, Rescale) { auto const decimal_range_bounds = cudf::range_window_bounds::get( cudf::fixed_point_scalar{RepT{200}, numeric::scale_type{-3}}); - EXPECT_THROW( - cudf::detail::range_comparable_value(decimal_range_bounds, order_by_data_type), - cudf::logic_error); + EXPECT_THROW(cudf::detail::range_comparable_value( + decimal_range_bounds, order_by_data_type, cudf::get_default_stream()), + cudf::logic_error); } } From e9ec83f787f43b0981848f682a1f36425c66fc0c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 15 Mar 2023 09:29:38 -0500 Subject: [PATCH 08/63] Fix set index error for Series rolling window operations (#12942) This PR fixes an issue with Series rolling window operations where 
there is a failure because `Series.set_index` has been dropped in the previous release. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12942 --- python/cudf/cudf/core/window/rolling.py | 3 ++- python/cudf/cudf/tests/test_rolling.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index cac4774400a..8a92ea86d57 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -557,5 +557,6 @@ def _apply_agg(self, agg_name): ) ) - result = super()._apply_agg(agg_name).set_index(index) + result = super()._apply_agg(agg_name) + result.index = index return result diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 62120619d94..b4e0983a9e3 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -547,3 +547,13 @@ def test_rolling_indexer_support(indexer): actual = gdf.rolling(window=indexer, min_periods=2).sum() assert_eq(expected, actual) + + +def test_rolling_series(): + df = cudf.DataFrame({"a": range(0, 100), "b": [10, 20, 30, 40, 50] * 20}) + pdf = df.to_pandas() + + expected = pdf.groupby("b")["a"].rolling(5).mean() + actual = df.groupby("b")["a"].rolling(5).mean() + + assert_eq(expected, actual) From ced3fdfddb2a52ed80d9b58d24ed5fc98a4eaab3 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 15 Mar 2023 09:58:33 -0500 Subject: [PATCH 09/63] Preserve integer dtype of hive-partitioned column containing nulls (#12930) This is a follow-up "fix" for https://github.com/rapidsai/cudf/pull/12866 While that PR enables the writing/reading of null hive partitions using `dask_cudf`, it does not preserve the type of integer partition columns containing nulls. 
This PR should address the remaining issue. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12930 --- python/cudf/cudf/io/parquet.py | 37 +++++++++++++------ .../dask_cudf/io/tests/test_parquet.py | 5 ++- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index ca4fb103ee8..1b7c1116205 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -15,7 +15,7 @@ import cudf from cudf._lib import parquet as libparquet from cudf.api.types import is_list_like -from cudf.core.column import as_column, build_categorical_column +from cudf.core.column import build_categorical_column, column_empty, full from cudf.utils import ioutils from cudf.utils.utils import _cudf_nvtx_annotate @@ -609,11 +609,12 @@ def _parquet_to_frame( ) # Add partition columns to the last DataFrame for (name, value) in part_key: + _len = len(dfs[-1]) if partition_categories and name in partition_categories: # Build the categorical column from `codes` - codes = as_column( - partition_categories[name].index(value), - length=len(dfs[-1]), + codes = full( + size=_len, + fill_value=partition_categories[name].index(value), ) dfs[-1][name] = build_categorical_column( categories=partition_categories[name], @@ -625,14 +626,23 @@ def _parquet_to_frame( else: # Not building categorical columns, so # `value` is already what we want - if partition_meta is not None: - dfs[-1][name] = as_column( - value, - length=len(dfs[-1]), - dtype=partition_meta[name].dtype, + _dtype = ( + partition_meta[name].dtype + if partition_meta is not None + else None + ) + if pd.isna(value): + dfs[-1][name] = column_empty( + row_count=_len, + dtype=_dtype, + masked=True, ) else: - dfs[-1][name] = as_column(value, length=len(dfs[-1])) + 
dfs[-1][name] = full( + size=_len, + fill_value=value, + dtype=_dtype, + ) # Concatenate dfs and return. # Assume we can ignore the index if it has no name. @@ -886,8 +896,11 @@ def _get_groups_and_offsets( grouped_df.reset_index(drop=True, inplace=True) grouped_df.drop(columns=partition_cols, inplace=True) # Copy the entire keys df in one operation rather than using iloc - part_names = part_keys.to_pandas().unique().to_frame(index=False) - + part_names = ( + part_keys.take(part_offsets[:-1]) + .to_pandas(nullable=True) + .to_frame(index=False) + ) return part_names, grouped_df, part_offsets diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 8fb6e591660..f5ae9706fde 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -508,13 +508,14 @@ def test_null_partition(tmpdir): import pyarrow as pa from pyarrow.dataset import HivePartitioning - df = pd.DataFrame({"id": [0, 1, None], "x": [1, 2, 3]}) + ids = pd.Series([0, 1, None], dtype="Int64") + df = pd.DataFrame({"id": ids, "x": [1, 2, 3]}) ddf = dd.from_pandas(df, npartitions=1).to_backend("cudf") ddf.to_parquet(str(tmpdir), partition_on="id") fns = glob.glob(os.path.join(tmpdir, "id" + "=*/*.parquet")) assert len(fns) == 3 - partitioning = HivePartitioning(pa.schema([("id", pa.float64())])) + partitioning = HivePartitioning(pa.schema([("id", pa.int64())])) ddf_read = dask_cudf.read_parquet( str(tmpdir), dataset={"partitioning": partitioning}, From 7776e0e202cabd8705160f9709653f19f1f7332a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 15 Mar 2023 10:59:07 -0400 Subject: [PATCH 10/63] Update libcudf counting functions to specify cudf::size_type (#12904) Adds section to developer guide about `cudf::size_type` and adds links to it from other relevant parts of the document. 
The fundamental nature of this type seems important enough to mention in the developer guide since it is the basis for how much of the code is designed and implemented. Also updates some doxygen for public APIs that are return `size_type` column values but had cited `INT32` specifically. Reference: https://github.com/rapidsai/cudf/pull/12779#discussion_r1117315749 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Lawrence Mitchell (https://github.com/wence-) - Yunsong Wang (https://github.com/PointKernel) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/12904 --- .../developer_guide/DEVELOPER_GUIDE.md | 23 +++++++---- cpp/include/cudf/lists/contains.hpp | 28 ++++++------- cpp/include/cudf/lists/count_elements.hpp | 8 ++-- cpp/include/cudf/search.hpp | 6 +-- cpp/include/cudf/sorting.hpp | 2 +- cpp/include/cudf/strings/attributes.hpp | 36 ++++++++--------- cpp/include/cudf/strings/contains.hpp | 2 +- .../cudf/strings/detail/strings_children.cuh | 2 +- .../detail/strings_column_factories.cuh | 2 +- cpp/include/nvtext/detail/tokenize.hpp | 28 +++---------- cpp/include/nvtext/tokenize.hpp | 20 +++++----- cpp/src/strings/attributes.cu | 40 +++++++++---------- cpp/src/strings/count_matches.cu | 4 +- 13 files changed, 95 insertions(+), 106 deletions(-) diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 8cd4f8c6d27..a88f621095c 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -269,6 +269,13 @@ An *immutable*, non-owning view of a table. A *mutable*, non-owning view of a table. 
+## cudf::size_type + +The `cudf::size_type` is the type used for the number of elements in a column, offsets to elements within a column, indices to address specific elements, segments for subsets of column elements, etc. +It is equivalent to a signed, 32-bit integer type and therefore has a maximum value of 2147483647. +Some APIs also accept negative index values and those functions support a minimum value of -2147483648. +This fundamental type also influences output values not just for column size limits but for counting elements as well. + ## Spans libcudf provides `span` classes that mimic C++20 `std::span`, which is a lightweight @@ -370,16 +377,16 @@ libcudf APIs should still perform any validation that does not require introspec To give some idea of what should or should not be validated, here are (non-exhaustive) lists of examples. **Things that libcudf should validate**: -- Input column/table sizes or dtypes +- Input column/table sizes or data types **Things that libcudf should not validate**: - Integer overflow -- Ensuring that outputs will not exceed the 2GB size limit for a given set of inputs +- Ensuring that outputs will not exceed the [2GB size](#cudfsize_type) limit for a given set of inputs ## libcudf expects nested types to have sanitized null masks -Various libcudf APIs accepting columns of nested dtypes (such as `LIST` or `STRUCT`) may assume that these columns have been sanitized. +Various libcudf APIs accepting columns of nested data types (such as `LIST` or `STRUCT`) may assume that these columns have been sanitized. In this context, sanitization refers to ensuring that the null elements in a column with a nested dtype are compatible with the elements of nested columns. Specifically: - Null elements of list columns should also be empty. The starting offset of a null element should be equal to the ending offset. 
@@ -746,8 +753,8 @@ where compile time was a problem is in types used to store indices, which can be The "Indexalator", or index-normalizing iterator (`include/cudf/detail/indexalator.cuh`), can be used for index types (integers) without requiring a type-specific instance. It can be used for any iterator interface for reading an array of integer values of type `int8`, `int16`, `int32`, -`int64`, `uint8`, `uint16`, `uint32`, or `uint64`. Reading specific elements always return a -`cudf::size_type` integer. +`int64`, `uint8`, `uint16`, `uint32`, or `uint64`. Reading specific elements always returns a +[`cudf::size_type`](#cudfsize_type) integer. Use the `indexalator_factory` to create an appropriate input iterator from a column_view. Example input iterator usage: @@ -1104,7 +1111,7 @@ For list columns, the parent column's type is `LIST` and contains no data, but i the number of lists in the column, and its null mask represents the validity of each list element. The parent has two children. -1. A non-nullable column of `INT32` elements that indicates the offset to the beginning of each list +1. A non-nullable column of [`size_type`](#cudfsize_type) elements that indicates the offset to the beginning of each list in a dense column of elements. 2. A column containing the actual data and optional null mask for all elements of all the lists packed together. @@ -1152,7 +1159,7 @@ a non-nullable column of `INT8` data. The parent column's type is `STRING` and c but its size represents the number of strings in the column, and its null mask represents the validity of each string. To summarize, the strings column children are: -1. A non-nullable column of `INT32` elements that indicates the offset to the beginning of each +1. A non-nullable column of [`size_type`](#cudfsize_type) elements that indicates the offset to the beginning of each string in a dense column of all characters. 2. 
A non-nullable column of `INT8` elements of all the characters across all the strings packed together. @@ -1264,7 +1271,7 @@ libcudf provides view types for nested column types as well as for the data elem `cudf::strings_column_view` is a view of a strings column, like `cudf::column_view` is a view of any `cudf::column`. `cudf::string_view` is a view of a single string, and therefore `cudf::string_view` is the data type of a `cudf::column` of type `STRING` just like `int32_t` is the -data type for a `cudf::column` of type `INT32`. As it's name implies, this is a read-only object +data type for a `cudf::column` of type [`size_type`](#cudfsize_type). As its name implies, this is a read-only object instance that points to device memory inside the strings column. It's lifespan is the same (or less) as the column it views. diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp index fbe931f945d..21c2ca1d64e 100644 --- a/cpp/include/cudf/lists/contains.hpp +++ b/cpp/include/cudf/lists/contains.hpp @@ -42,7 +42,7 @@ namespace lists { * * @param lists Lists column whose `n` rows are to be searched * @param search_key The scalar key to be looked up in each list row - * @param mr Device memory resource used to allocate the returned column's device memory. + * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( @@ -64,7 +64,7 @@ std::unique_ptr contains( * * @param lists Lists column whose `n` rows are to be searched * @param search_keys Column of elements to be looked up in each list row - * @param mr Device memory resource used to allocate the returned column's device memory. 
+ * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( @@ -85,7 +85,7 @@ std::unique_ptr contains( * Nulls inside non-null nested elements (such as lists or structs) are not considered. * * @param lists Lists column whose `n` rows are to be searched - * @param mr Device memory resource used to allocate the returned column's device memory. + * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains_nulls( @@ -102,7 +102,7 @@ enum class duplicate_find_option : int32_t { }; /** - * @brief Create a column of `size_type` values indicating the position of a search key + * @brief Create a column of values indicating the position of a search key * within each list row in the `lists` column * * The output column has as many elements as there are rows in the input `lists` column. @@ -119,14 +119,14 @@ enum class duplicate_find_option : int32_t { * If `find_option == FIND_LAST`, the position of the last match in the list row is * returned. * + * @throw cudf::data_type_error If `search_keys` type does not match the element type in `lists` + * * @param lists Lists column whose `n` rows are to be searched * @param search_key The scalar key to be looked up in each list row * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) - * @param mr Device memory resource used to allocate the returned column's device memory. 
- * @return INT32 column of `n` rows with the location of the `search_key` - * - * @throw cudf::data_type_error If `search_keys` type does not match the element type in `lists` + * @param mr Device memory resource used to allocate the returned column's device memory + * @return column of `n` rows with the location of the `search_key` */ std::unique_ptr index_of( cudf::lists_column_view const& lists, @@ -135,7 +135,7 @@ std::unique_ptr index_of( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Create a column of `size_type` values indicating the position of a search key + * @brief Create a column of values indicating the position of a search key * row within the corresponding list row in the `lists` column * * The output column has as many elements as there are rows in the input `lists` column. @@ -152,16 +152,16 @@ std::unique_ptr index_of( * If `find_option == FIND_LAST`, the position of the last match in the list row is * returned. * + * @throw cudf::logic_error If `search_keys` does not match `lists` in its number of rows + * @throw cudf::data_type_error If `search_keys` type does not match the element type in `lists` + * * @param lists Lists column whose `n` rows are to be searched * @param search_keys A column of search keys to be looked up in each corresponding row of * `lists` * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) - * @param mr Device memory resource used to allocate the returned column's device memory. 
- * @return INT32 column of `n` rows with the location of the `search_key` - * - * @throw cudf::logic_error If `search_keys` does not match `lists` in its number of rows - * @throw cudf::data_type_error If `search_keys` type does not match the element type in `lists` + * @param mr Device memory resource used to allocate the returned column's device memory + * @return column of `n` rows with the location of the `search_key` */ std::unique_ptr index_of( cudf::lists_column_view const& lists, diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp index dac6c1b5bf8..552ba058b93 100644 --- a/cpp/include/cudf/lists/count_elements.hpp +++ b/cpp/include/cudf/lists/count_elements.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,9 +44,9 @@ namespace lists { * Any null input element will result in a corresponding null entry * in the output column. * - * @param input Input lists column. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New INT32 column with the number of elements for each row. + * @param input Input lists column + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with the number of elements for each row */ std::unique_ptr count_elements( lists_column_view const& input, diff --git a/cpp/include/cudf/search.hpp b/cpp/include/cudf/search.hpp index bd9520df644..fee22786d7a 100644 --- a/cpp/include/cudf/search.hpp +++ b/cpp/include/cudf/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -64,7 +64,7 @@ namespace cudf { * @param column_order Vector of column sort order * @param null_precedence Vector of null_precedence enums needles * @param mr Device memory resource used to allocate the returned column's device memory - * @return A non-nullable column of cudf::size_type elements containing the insertion points + * @return A non-nullable column of elements containing the insertion points */ std::unique_ptr lower_bound( table_view const& haystack, @@ -104,7 +104,7 @@ std::unique_ptr lower_bound( * @param column_order Vector of column sort order * @param null_precedence Vector of null_precedence enums needles * @param mr Device memory resource used to allocate the returned column's device memory - * @return A non-nullable column of cudf::size_type elements containing the insertion points + * @return A non-nullable column of elements containing the insertion points */ std::unique_ptr upper_bound( table_view const& haystack, diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index 922bed3b1ea..6924e77ae9b 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -44,7 +44,7 @@ namespace cudf { * for each column. Size must be equal to `input.num_columns()` or empty. * If empty, all columns will be sorted in `null_order::BEFORE`. * @param mr Device memory resource used to allocate the returned column's device memory - * @return A non-nullable column of `size_type` elements containing the permuted row indices of + * @return A non-nullable column of elements containing the permuted row indices of * `input` if it were sorted */ std::unique_ptr sorted_order( diff --git a/cpp/include/cudf/strings/attributes.hpp b/cpp/include/cudf/strings/attributes.hpp index f0f7c667697..85086e44a26 100644 --- a/cpp/include/cudf/strings/attributes.hpp +++ b/cpp/include/cudf/strings/attributes.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,8 +32,8 @@ namespace strings { */ /** - * @brief Returns an integer numeric column containing the length of each string in - * characters. + * @brief Returns a column containing character lengths + * of each string in the given column * * The output column will have the same number of rows as the * specified strings column. Each row value will be the number of @@ -41,17 +41,17 @@ namespace strings { * * Any null string will result in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New INT32 column with lengths for each string. + * @param input Strings instance for this operation + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with lengths for each string */ std::unique_ptr count_characters( - strings_column_view const& strings, + strings_column_view const& input, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a numeric column containing the length of each string in - * bytes. + * @brief Returns a column containing byte lengths + * of each string in the given column * * The output column will have the same number of rows as the * specified strings column. Each row value will be the number of @@ -59,17 +59,17 @@ std::unique_ptr count_characters( * * Any null string will result in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New INT32 column with the number of bytes for each string. 
+ * @param input Strings instance for this operation + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with the number of bytes for each string */ std::unique_ptr count_bytes( - strings_column_view const& strings, + strings_column_view const& input, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Creates a numeric column with code point values (integers) for each - * character of each string. + * character of each string * * A code point is the integer value representation of a character. * For example, the code point value for the character 'A' in UTF-8 is 65. @@ -79,12 +79,12 @@ std::unique_ptr count_bytes( * * Any null string is ignored. No null entries will appear in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New INT32 column with code point integer values for each character. 
+ * @param input Strings instance for this operation + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New INT32 column with code point integer values for each character */ std::unique_ptr code_points( - strings_column_view const& strings, + strings_column_view const& input, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of strings_apis group diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index aebc4ae7dab..92914bc810f 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -165,7 +165,7 @@ std::unique_ptr matches_re( * @param strings Strings instance for this operation * @param prog Regex program instance * @param mr Device memory resource used to allocate the returned column's device memory - * @return New INT32 column with counts for each string + * @return New column of match counts for each string */ std::unique_ptr count_re( strings_column_view const& strings, diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 09e0f3bb079..02a65c01178 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -59,7 +59,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, rmm::mr::device_memory_resource* mr) { auto offsets_column = make_numeric_column( - data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); auto offsets_view = offsets_column->mutable_view(); auto d_offsets = offsets_view.template data(); size_and_exec_fn.d_offsets = d_offsets; diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 2939c47e6af..a3a5946fe55 100644 --- 
a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -175,7 +175,7 @@ std::unique_ptr make_strings_column(CharIterator chars_begin, // build offsets column -- this is the number of strings + 1 auto offsets_column = make_numeric_column( - data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); auto offsets_view = offsets_column->mutable_view(); thrust::transform(rmm::exec_policy(stream), offsets_begin, diff --git a/cpp/include/nvtext/detail/tokenize.hpp b/cpp/include/nvtext/detail/tokenize.hpp index 38b49e63590..80a6edc496b 100644 --- a/cpp/include/nvtext/detail/tokenize.hpp +++ b/cpp/include/nvtext/detail/tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,12 +28,7 @@ namespace detail { * @copydoc nvtext::tokenize(strings_column_view const&,string_scalar * const&,rmm::mr::device_memory_resource*) * - * @param strings Strings column tokenize. - * @param delimiter UTF-8 characters used to separate each string into tokens. - * The default of empty string will separate tokens using whitespace. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. 
+ * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr tokenize(cudf::strings_column_view const& strings, cudf::string_scalar const& delimiter, @@ -44,11 +39,7 @@ std::unique_ptr tokenize(cudf::strings_column_view const& strings, * @copydoc nvtext::tokenize(strings_column_view const&,strings_column_view * const&,rmm::mr::device_memory_resource*) * - * @param strings Strings column to tokenize. - * @param delimiters Strings used to separate individual strings into tokens. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr tokenize(cudf::strings_column_view const& strings, cudf::strings_column_view const& delimiters, @@ -59,12 +50,7 @@ std::unique_ptr tokenize(cudf::strings_column_view const& strings, * @copydoc nvtext::count_tokens(strings_column_view const&, string_scalar * const&,rmm::mr::device_memory_resource*) * - * @param strings Strings column to use for this operation. - * @param delimiter Strings used to separate each string into tokens. - * The default of empty string will separate tokens using whitespace. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New INT32 column of token counts. 
+ * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr count_tokens(cudf::strings_column_view const& strings, cudf::string_scalar const& delimiter, @@ -75,11 +61,7 @@ std::unique_ptr count_tokens(cudf::strings_column_view const& stri * @copydoc nvtext::count_tokens(strings_column_view const&,strings_column_view * const&,rmm::mr::device_memory_resource*) * - * @param strings Strings column to use for this operation. - * @param delimiters Strings used to separate each string into tokens. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New INT32 column of token counts. + * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr count_tokens(cudf::strings_column_view const& strings, cudf::strings_column_view const& delimiters, diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp index 10a9f746d76..a72f7dcfa59 100644 --- a/cpp/include/nvtext/tokenize.hpp +++ b/cpp/include/nvtext/tokenize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,11 +112,11 @@ std::unique_ptr tokenize( * All null row entries are ignored and the output contains all valid rows. * The number of tokens for a null element is set to 0 in the output column. * - * @param strings Strings column to use for this operation. - * @param delimiter Strings used to separate each string into tokens. + * @param strings Strings column to use for this operation + * @param delimiter Strings used to separate each string into tokens; * The default of empty string will separate tokens using whitespace. 
- * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New INT32 column of token counts. + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of token counts */ std::unique_ptr count_tokens( cudf::strings_column_view const& strings, @@ -141,12 +141,12 @@ std::unique_ptr count_tokens( * All null row entries are ignored and the output contains all valid rows. * The number of tokens for a null element is set to 0 in the output column. * - * @throw cudf::logic_error if the delimiters column is empty or contains nulls. + * @throw cudf::logic_error if the delimiters column is empty or contains nulls * - * @param strings Strings column to use for this operation. - * @param delimiters Strings used to separate each string into tokens. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New INT32 column of token counts. + * @param strings Strings column to use for this operation + * @param delimiters Strings used to separate each string into tokens + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of token counts */ std::unique_ptr count_tokens( cudf::strings_column_view const& strings, diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index 66288c7d14d..3a1b7044b56 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -60,16 +60,16 @@ constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 64; /** * @brief Returns a numeric column containing lengths of each string in - * based on the provided unary function. + * based on the provided unary function * * Any null string will result in a null entry for that row in the output column. * - * @tparam UnaryFunction Device function that returns an integer given a string_view. - * @param strings Strings instance for this operation. 
- * @param ufn Function returns an integer for each string. - * @param stream CUDA stream used for device memory operations and kernel launches. + * @tparam UnaryFunction Device function that returns an integer given a string_view + * @param strings Strings instance for this operation + * @param ufn Function returns an integer for each string + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New INT32 column with lengths for each string. + * @return New column with lengths for each string */ template std::unique_ptr counts_fn(strings_column_view const& strings, @@ -78,7 +78,7 @@ std::unique_ptr counts_fn(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { // create output column - auto results = make_numeric_column(data_type{type_id::INT32}, + auto results = make_numeric_column(data_type{type_to_id()}, strings.size(), cudf::detail::copy_bitmask(strings.parent(), stream, mr), strings.null_count(), @@ -176,12 +176,12 @@ std::unique_ptr count_characters(strings_column_view const& input, return count_characters_parallel(input, stream, mr); } -std::unique_ptr count_bytes(strings_column_view const& strings, +std::unique_ptr count_bytes(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto ufn = [] __device__(string_view const& d_str) { return d_str.size_bytes(); }; - return counts_fn(strings, ufn, stream, mr); + return counts_fn(input, ufn, stream, mr); } } // namespace detail @@ -214,19 +214,19 @@ struct code_points_fn { namespace detail { // -std::unique_ptr code_points(strings_column_view const& strings, +std::unique_ptr code_points(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = 
column_device_view::create(input.parent(), stream); auto d_column = *strings_column; // create offsets vector to account for each string's character length - rmm::device_uvector offsets(strings.size() + 1, stream); + rmm::device_uvector offsets(input.size() + 1, stream); thrust::transform_inclusive_scan( rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), + thrust::make_counting_iterator(input.size()), offsets.begin() + 1, [d_column] __device__(size_type idx) { size_type length = 0; @@ -248,7 +248,7 @@ std::unique_ptr code_points(strings_column_view const& strings, // now set the ranges from each strings' character values thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - strings.size(), + input.size(), code_points_fn{d_column, offsets.data(), d_results}); results->set_null_count(0); @@ -259,25 +259,25 @@ std::unique_ptr code_points(strings_column_view const& strings, // external APIS -std::unique_ptr count_characters(strings_column_view const& strings, +std::unique_ptr count_characters(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_characters(strings, cudf::get_default_stream(), mr); + return detail::count_characters(input, cudf::get_default_stream(), mr); } -std::unique_ptr count_bytes(strings_column_view const& strings, +std::unique_ptr count_bytes(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_bytes(strings, cudf::get_default_stream(), mr); + return detail::count_bytes(input, cudf::get_default_stream(), mr); } -std::unique_ptr code_points(strings_column_view const& strings, +std::unique_ptr code_points(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::code_points(strings, cudf::get_default_stream(), mr); + return detail::code_points(input, cudf::get_default_stream(), mr); } } // 
namespace strings diff --git a/cpp/src/strings/count_matches.cu b/cpp/src/strings/count_matches.cu index 9d29bbb8c96..1fde3a54089 100644 --- a/cpp/src/strings/count_matches.cu +++ b/cpp/src/strings/count_matches.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -63,7 +63,7 @@ std::unique_ptr count_matches(column_device_view const& d_strings, assert(output_size >= d_strings.size() and "Unexpected output size"); auto results = make_numeric_column( - data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr); + data_type{type_to_id()}, output_size, mask_state::UNALLOCATED, stream, mr); if (d_strings.size() == 0) return results; From 1b780396ccfa812d658fa295ef685d4dd7bd9221 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 15 Mar 2023 11:09:24 -0400 Subject: [PATCH 11/63] Fix cudf::segmented_reduce gtest for ANY aggregation (#12940) Minor fix to the `SegmentedReductionTest/AnyExcludeNulls` gtest to use `0/false` as the initial value to better test and demonstrate the usage. Found this when looking for an example to answer issue #10455 Also reworked the code in this test to use variables to help minimize copy errors and shorten the code size. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - https://github.com/nvdbaranec - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12940 --- .../reductions/segmented_reduction_tests.cpp | 29 ++++++------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index 74c5e7fb504..47bcbb874cf 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -251,36 +251,25 @@ TYPED_TEST(SegmentedReductionTest, AnyExcludeNulls) {false, false, true, true, bool{XXX}, false, true, bool{XXX}, bool{XXX}}, {true, true, true, true, false, true, true, false, false}}; - auto res = - cudf::segmented_reduce(input, - d_offsets, - *cudf::make_any_aggregation(), - cudf::data_type{cudf::type_id::BOOL8}, - cudf::null_policy::EXCLUDE); + auto const agg = cudf::make_any_aggregation(); + auto const output_type = cudf::data_type{cudf::type_id::BOOL8}; + auto const policy = cudf::null_policy::EXCLUDE; + + auto res = cudf::segmented_reduce(input, d_offsets, *agg, output_type, policy); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); // Test with initial value - auto const init_scalar = cudf::make_fixed_width_scalar(1); + auto const init_scalar = cudf::make_fixed_width_scalar(0); auto const init_expect = cudf::test::fixed_width_column_wrapper{ - {true, true, true, true, true, true, true, true, true}, + {false, false, true, true, false, false, true, false, false}, {true, true, true, true, true, true, true, true, true}}; - res = cudf::segmented_reduce(input, - d_offsets, - *cudf::make_any_aggregation(), - cudf::data_type{cudf::type_id::BOOL8}, - cudf::null_policy::EXCLUDE, - *init_scalar); + res = cudf::segmented_reduce(input, d_offsets, *agg, output_type, policy, *init_scalar); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, init_expect); // Test with 
null initial value init_scalar->set_valid_async(false); - res = cudf::segmented_reduce(input, - d_offsets, - *cudf::make_any_aggregation(), - cudf::data_type{cudf::type_id::BOOL8}, - cudf::null_policy::EXCLUDE, - *init_scalar); + res = cudf::segmented_reduce(input, d_offsets, *agg, output_type, policy, *init_scalar); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*res, expect); } From 3c72eb0fbd76f696507f1ebc0e21af2cdebfbdb7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Mar 2023 15:14:49 -0400 Subject: [PATCH 12/63] Generate pyproject dependencies using dfg (#12906) This PR updates cudf to generate all build, run and test dependencies in pyproject.toml using `rapids-dependency-file-generator`. In the process, a few additional relevant changes were made: - dependencies.yaml was refactored to more precisely capture the requirements of the different packages in the repo as well as more accurately separate the build/run/test dependencies. This should facilitate generation of meta.yaml dependencies with `rapids-dependency-file-generator` if we should choose to do so. - cudf_kafka's setup.py had as much logic as possible moved to pyproject.toml without having to change the way that extension modules are built. This package is somewhat unique within RAPIDS in having a C++ component that we have not converted to scikit-build because we have no plans for wheels there at this time. As a result, it was not previously converted to a pyproject.toml-driven build (classic Cython builds are not compatible with pyproject.toml-driven builds). - A tiny unrelated fix to the Java build that removed an outdated CMake flag. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Mark Sadang (https://github.com/msadang) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/12906 --- .github/workflows/pr.yaml | 3 +- .github/workflows/test.yaml | 2 + .pre-commit-config.yaml | 2 +- ci/release/update-version.sh | 16 +- ci/test_java.sh | 2 +- .../all_cuda-118_arch-x86_64.yaml | 13 +- cpp/cmake/thirdparty/get_arrow.cmake | 2 + dependencies.yaml | 334 ++++++++++++------ python/cudf/pyproject.toml | 44 +-- python/cudf_kafka/MANIFEST.in | 3 + python/cudf_kafka/pyproject.toml | 35 +- python/cudf_kafka/setup.py | 23 -- python/custreamz/pyproject.toml | 14 +- python/dask_cudf/pyproject.toml | 18 +- 14 files changed, 333 insertions(+), 178 deletions(-) create mode 100644 python/cudf_kafka/MANIFEST.in diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index d02825b73d1..dd4482375b9 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -133,5 +133,6 @@ jobs: with: build_type: pull-request package-name: dask_cudf - test-before: "RAPIDS_PY_WHEEL_NAME=cudf_cu11 rapids-download-wheels-from-s3 ./local-cudf-dep && python -m pip install --no-deps ./local-cudf-dep/cudf*.whl" + # Install the cudf we just built, and also test against latest dask/distributed/dask-cuda. 
+ test-before: "RAPIDS_PY_WHEEL_NAME=cudf_cu11 rapids-download-wheels-from-s3 ./local-cudf-dep && python -m pip install --no-deps ./local-cudf-dep/cudf*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.04" test-unittest: "python -m pytest -v -n 8 ./python/dask_cudf/dask_cudf/tests" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c808e1475e6..a4bd14439b0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -97,4 +97,6 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} package-name: dask_cudf + # Test against latest dask/distributed/dask-cuda. + test-before: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.04" test-unittest: "python -m pytest -v -n 8 ./python/dask_cudf/dask_cudf/tests" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1eb2c508db9..8b46eb25950 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -161,7 +161,7 @@ repos: ^CHANGELOG.md$ ) - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.4.0 + rev: v1.5.1 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index e5c9ba0569f..dc5ea6015f9 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -24,6 +24,11 @@ NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*" +# Need to distutils-normalize the versions for some use cases +CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; 
print(packaging.version.Version('${CURRENT_SHORT_TAG}'))") +NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") +echo "current is ${CURRENT_SHORT_TAG_PEP440}, next is ${NEXT_SHORT_TAG_PEP440}" + echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" # Inplace sed replace; workaround for Linux and Mac @@ -70,9 +75,10 @@ sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/ # bump rmm & dask-cuda for FILE in conda/environments/*.yaml dependencies.yaml; do - sed_runner "s/dask-cuda=${CURRENT_SHORT_TAG}/dask-cuda=${NEXT_SHORT_TAG}/g" ${FILE}; - sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE}; - sed_runner "s/rmm-cu11=${CURRENT_SHORT_TAG}/rmm-cu11=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/dask-cuda==${CURRENT_SHORT_TAG_PEP440}/dask-cuda==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; + sed_runner "s/rmm==${CURRENT_SHORT_TAG_PEP440}/rmm==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; + sed_runner "s/cudf==${CURRENT_SHORT_TAG_PEP440}/cudf==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; + sed_runner "s/cudf_kafka==${CURRENT_SHORT_TAG_PEP440}/cudf_kafka==${NEXT_SHORT_TAG_PEP440}/g" ${FILE}; done # Doxyfile update @@ -86,13 +92,11 @@ sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt -# Need to distutils-normalize the original version -NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") - # Dependency versions in pyproject.toml sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/pyproject.toml sed_runner "s/cudf==.*\",/cudf==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/dask_cudf/pyproject.toml for FILE in 
.github/workflows/*.yaml; do sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" + sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; done diff --git a/ci/test_java.sh b/ci/test_java.sh index f905aaa1178..e4df62501cc 100755 --- a/ci/test_java.sh +++ b/ci/test_java.sh @@ -38,7 +38,7 @@ set +e rapids-logger "Run Java tests" pushd java -mvn test -B -DCUDF_JNI_ARROW_STATIC=OFF -DCUDF_JNI_ENABLE_PROFILING=OFF +mvn test -B -DCUDF_JNI_ENABLE_PROFILING=OFF popd rapids-logger "Test script exiting with value: $EXITCODE" diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 66d375910d4..ef92a9ad80f 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - cupy>=9.5.0,<12.0.0a0 - cxx-compiler - cython>=0.29,<0.30 -- dask-cuda=23.04.* +- dask-cuda==23.4.* - dask>=2023.1.1 - distributed>=2023.1.1 - dlpack>=0.5,<0.6.0a0 @@ -32,11 +32,12 @@ dependencies: - gcc_linux-64=11.* - hypothesis - ipython -- libarrow=10 +- libarrow==10.0.1.* - librdkafka=1.7.0 -- librmm=23.04.* +- librmm==23.4.* - mimesis>=4.1.0 - moto>=4.0.8 +- msgpack-python - myst-nb - nbsphinx - ninja @@ -53,7 +54,7 @@ dependencies: - pre-commit - protobuf>=4.21.6,<4.22 - ptxcompiler -- pyarrow=10 +- pyarrow==10.0.1.* - pydata-sphinx-theme - pyorc - pytest @@ -61,11 +62,11 @@ dependencies: - pytest-cases - pytest-cov - pytest-xdist -- python-confluent-kafka=1.7.0 +- python-confluent-kafka==1.7.0 - python-snappy>=0.6.0 - python>=3.8,<3.11 - pytorch<1.12.0 -- rmm=23.04.* +- rmm==23.4.* - s3fs>=2022.3.0 - scikit-build>=0.13.1 - scipy diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 943b89238e0..a716995182d 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -379,6 +379,8 @@ endfunction() if(NOT DEFINED 
CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow + # This version must be kept in sync with the libarrow version pinned for builds in + # dependencies.yaml. 10.0.1 CACHE STRING "The version of Arrow to find (or build)" ) diff --git a/dependencies.yaml b/dependencies.yaml index 7b623d58425..e3fe3bbda0d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -8,31 +8,43 @@ files: includes: - build_all - build_cpp + - build_wheels - build_python + - build_python_common - cudatoolkit - develop - docs - notebooks - py_version - - run + - run_common + - run_cudf + - run_dask_cudf + - run_cudf_kafka + - run_custreamz - test_cpp - - test_python + - test_python_common + - test_python_cudf + - test_python_dask_cudf test_cpp: output: none includes: - cudatoolkit - - libidentify_stream_usage_build - test_cpp + - libarrow_run test_python: output: none includes: - cudatoolkit - py_version - - test_python + - test_python_common + - test_python_cudf + - test_python_dask_cudf + - pyarrow_run test_java: output: none includes: - build_all + - libarrow_run - cudatoolkit - test_java test_notebooks: @@ -51,6 +63,103 @@ files: - cudatoolkit - docs - py_version + py_build_cudf: + output: pyproject + pyproject_dir: python/cudf + extras: + table: build-system + includes: + - build_all + - build_python + - build_python_common + - build_wheels + py_run_cudf: + output: pyproject + pyproject_dir: python/cudf + extras: + table: project + includes: + - run_common + - run_cudf + - pyarrow_run + py_test_cudf: + output: pyproject + pyproject_dir: python/cudf + extras: + table: project.optional-dependencies + key: test + includes: + - test_python_common + - test_python_cudf + py_build_dask_cudf: + output: pyproject + pyproject_dir: python/dask_cudf + extras: + table: build-system + includes: + - build_wheels + py_run_dask_cudf: + output: pyproject + pyproject_dir: python/dask_cudf + extras: + table: project + includes: + - run_common + - run_dask_cudf + py_test_dask_cudf: + output: pyproject + 
pyproject_dir: python/dask_cudf + extras: + table: project.optional-dependencies + key: test + includes: + - test_python_common + - test_python_dask_cudf + py_build_cudf_kafka: + output: pyproject + pyproject_dir: python/cudf_kafka + extras: + table: build-system + includes: + - build_wheels + - build_python_common + py_run_cudf_kafka: + output: pyproject + pyproject_dir: python/cudf_kafka + extras: + table: project + includes: + - run_cudf_kafka + py_test_cudf_kafka: + output: pyproject + pyproject_dir: python/cudf_kafka + extras: + table: project.optional-dependencies + key: test + includes: + - test_python_common + py_build_custreamz: + output: pyproject + pyproject_dir: python/custreamz + extras: + table: build-system + includes: + - build_wheels + py_run_custreamz: + output: pyproject + pyproject_dir: python/custreamz + extras: + table: project + includes: + - run_custreamz + py_test_custreamz: + output: pyproject + pyproject_dir: python/custreamz + extras: + table: project.optional-dependencies + key: test + includes: + - test_python_common channels: - rapidsai - rapidsai-nightly @@ -61,29 +170,28 @@ channels: dependencies: build_all: common: - - output_types: [conda, requirements] + - output_types: [conda, requirements, pyproject] packages: - &cmake_ver cmake>=3.23.1,!=3.25.0 - - dlpack>=0.5,<0.6.0a0 - ninja - output_types: conda packages: - - libarrow=10 - c-compiler - cxx-compiler + - dlpack>=0.5,<0.6.0a0 specific: - output_types: conda matrices: - matrix: arch: x86_64 packages: - - &gcc_amd64 gcc_linux-64=11.* - - &sysroot_amd64 sysroot_linux-64==2.17 + - gcc_linux-64=11.* + - sysroot_linux-64==2.17 - matrix: arch: aarch64 packages: - - &gcc_aarch64 gcc_linux-aarch64=11.* - - &sysroot_aarch64 sysroot_linux-aarch64==2.17 + - gcc_linux-aarch64=11.* + - sysroot_linux-aarch64==2.17 - output_types: conda matrices: - matrix: @@ -100,24 +208,54 @@ dependencies: common: - output_types: [conda, requirements] packages: - - librmm=23.04.* + - librmm==23.4.* - 
output_types: conda packages: - fmt>=9.1.0,<10 - librdkafka=1.7.0 - spdlog>=1.11.0,<1.12 - build_python: + # Hard pin the patch version used during the build. This must be kept + # in sync with the version pinned in get_arrow.cmake. + - libarrow==10.0.1.* + build_wheels: common: - - output_types: [conda, requirements] + - output_types: pyproject + packages: + - wheel + - setuptools + build_python_common: + common: + - output_types: [conda, requirements, pyproject] packages: - - cuda-python>=11.7.1,<12.0 - cython>=0.29,<0.30 - - pyarrow=10 - - rmm=23.04.* + # Hard pin the patch version used during the build. This must be kept + # in sync with the version pinned in get_arrow.cmake. + - pyarrow==10.0.1.* + - numpy>=1.21 + build_python: + common: + - output_types: [conda, requirements, pyproject] + packages: - scikit-build>=0.13.1 + - rmm==23.4.* - output_types: conda packages: - - protobuf>=4.21.6,<4.22 + - &protobuf protobuf>=4.21.6,<4.22 + - output_types: pyproject + packages: + - protoc-wheel + libarrow_run: + common: + - output_types: [conda, requirements] + packages: + # Allow runtime version to float up to minor version + - libarrow==10.* + pyarrow_run: + common: + - output_types: [conda, requirements, pyproject] + packages: + # Allow runtime version to float up to minor version + - pyarrow==10.* cudatoolkit: specific: - output_types: conda @@ -161,66 +299,6 @@ dependencies: - sphinx-copybutton - sphinx-markdown-tables - sphinxcontrib-websupport - libidentify_stream_usage_build: - common: - - output_types: conda - packages: - - *cmake_ver - specific: - - output_types: conda - matrices: - - matrix: - arch: x86_64 - packages: - - *gcc_amd64 - - *sysroot_amd64 - - matrix: - arch: aarch64 - packages: - - *gcc_aarch64 - - *sysroot_aarch64 - - output_types: conda - matrices: - - matrix: - arch: x86_64 - cuda: "11.2" - packages: - - nvcc_linux-64=11.2 - - matrix: - arch: aarch64 - cuda: "11.2" - packages: - - nvcc_linux-aarch64=11.2 - - matrix: - arch: x86_64 - cuda: 
"11.4" - packages: - - nvcc_linux-64=11.4 - - matrix: - arch: aarch64 - cuda: "11.4" - packages: - - nvcc_linux-aarch64=11.4 - - matrix: - arch: x86_64 - cuda: "11.5" - packages: - - nvcc_linux-64=11.5 - - matrix: - arch: aarch64 - cuda: "11.5" - packages: - - nvcc_linux-aarch64=11.5 - - matrix: - arch: x86_64 - cuda: "11.8" - packages: - - nvcc_linux-64=11.8 - - matrix: - arch: aarch64 - cuda: "11.8" - packages: - - nvcc_linux-aarch64=11.8 notebooks: common: - output_types: [conda, requirements] @@ -247,22 +325,25 @@ dependencies: - matrix: packages: - python>=3.8,<3.11 - run: + run_common: common: - - output_types: [conda, requirements] + - output_types: [conda, requirements, pyproject] packages: - - cachetools - - dask>=2023.1.1 - - distributed>=2023.1.1 - fsspec>=0.6.0 - - numba>=0.56.2 - numpy>=1.21 + - pandas>=1.3,<1.6.0dev0 + run_cudf: + common: + - output_types: [conda, requirements, pyproject] + packages: + - cachetools + - cuda-python>=11.7.1,<12.0 + - numba>=0.56.2 - nvtx>=0.2.1 - packaging - - pandas>=1.3,<1.6.0dev0 - - python-confluent-kafka=1.7.0 - - streamz + - rmm==23.4.* - typing_extensions + - *protobuf - output_types: conda packages: - cubinlinker @@ -271,7 +352,6 @@ dependencies: - pip: - git+https://github.com/python-streamz/streamz.git@master - ptxcompiler - - rmm=23.04.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -280,7 +360,11 @@ dependencies: - cubinlinker-cu11 - git+https://github.com/python-streamz/streamz.git@master - ptxcompiler-cu11 - - rmm-cu11=23.04.* + - output_types: pyproject + packages: + - cubinlinker + - &cupy_pip cupy-cuda11x>=9.5.0,<12.0.0a0 + - ptxcompiler specific: - output_types: requirements matrices: @@ -292,6 +376,34 @@ dependencies: arch: aarch64 packages: - cupy-cuda11x -f https://pip.cupy.dev/aarch64 # TODO: Verify that this works. 
+ run_dask_cudf: + common: + - output_types: [conda, requirements, pyproject] + packages: + - dask>=2023.1.1 + - distributed>=2023.1.1 + - output_types: pyproject + packages: + - &cudf cudf==23.4.* + - *cupy_pip + run_cudf_kafka: + common: + - output_types: conda + packages: + - python-confluent-kafka==1.7.0 + - output_types: [requirements, pyproject] + packages: + - *cudf + - confluent-kafka==1.7.0 + run_custreamz: + common: + - output_types: [conda, requirements, pyproject] + packages: + - streamz + - output_types: [requirements, pyproject] + packages: + - *cudf + - cudf_kafka==23.4.* test_cpp: specific: - output_types: conda @@ -320,27 +432,39 @@ dependencies: cuda: "11.8" packages: - cuda-nvtx=11.8 - test_python: + test_python_common: common: - - output_types: [conda, requirements] + - output_types: [conda, requirements, pyproject] + packages: + - pytest + - pytest-cov + - pytest-xdist + test_python_cudf: + common: + - output_types: [conda, requirements, pyproject] packages: - - aiobotocore>=2.2.0 - - boto3>=1.21.21 - - botocore>=1.24.21 - - dask-cuda=23.04.* - fastavro>=0.22.9 - hypothesis - mimesis>=4.1.0 - - moto>=4.0.8 - pyorc - - pytest - pytest-benchmark - pytest-cases - - pytest-cov - - pytest-xdist - python-snappy>=0.6.0 - - s3fs>=2022.3.0 - scipy + - output_types: conda + packages: + - aiobotocore>=2.2.0 + - boto3>=1.21.21 + - botocore>=1.24.21 + - msgpack-python + - moto>=4.0.8 + - s3fs>=2022.3.0 + - output_types: pyproject + packages: + - msgpack + - &tokenizers tokenizers==0.13.1 + - &transformers transformers==4.24.0 + - tzdata specific: - output_types: conda matrices: @@ -352,7 +476,13 @@ dependencies: - pytorch<1.12.0 # We only install these on x86_64 to avoid pulling pytorch as a # dependency of transformers. 
- - tokenizers==0.13.1 - - transformers==4.24.0 + - *tokenizers + - *transformers - matrix: packages: + test_python_dask_cudf: + common: + - output_types: [conda, requirements, pyproject] + packages: + - dask-cuda==23.4.* + - numba>=0.56.2 diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 5b259b1dc66..6832a7aef26 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -3,18 +3,17 @@ [build-system] build-backend = "setuptools.build_meta" requires = [ - "wheel", - "setuptools", - "cython>=0.29,<0.30", - "scikit-build>=0.13.1", "cmake>=3.23.1,!=3.25.0", + "cython>=0.29,<0.30", "ninja", - "numpy", - # Hard pin the patch version used during the build. - "pyarrow==10.0.1", + "numpy>=1.21", "protoc-wheel", + "pyarrow==10.0.1.*", "rmm==23.4.*", -] + "scikit-build>=0.13.1", + "setuptools", + "wheel", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project] name = "cudf" @@ -28,7 +27,9 @@ license = { text = "Apache 2.0" } requires-python = ">=3.8" dependencies = [ "cachetools", + "cubinlinker", "cuda-python>=11.7.1,<12.0", + "cupy-cuda11x>=9.5.0,<12.0.0a0", "fsspec>=0.6.0", "numba>=0.56.2", "numpy>=1.21", @@ -36,14 +37,11 @@ dependencies = [ "packaging", "pandas>=1.3,<1.6.0dev0", "protobuf>=4.21.6,<4.22", - "typing_extensions", - # Allow floating minor versions for Arrow. - "pyarrow==10", - "rmm==23.4.*", "ptxcompiler", - "cubinlinker", - "cupy-cuda11x", -] + "pyarrow==10.*", + "rmm==23.4.*", + "typing_extensions", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", "Topic :: Database", @@ -56,18 +54,22 @@ classifiers = [ [project.optional-dependencies] test = [ + "fastavro>=0.22.9", + "hypothesis", + "mimesis>=4.1.0", + "msgpack", + "pyorc", "pytest", "pytest-benchmark", + "pytest-cases", + "pytest-cov", "pytest-xdist", - "hypothesis", - "mimesis>=4.1.0", - "fastavro>=0.22.9", "python-snappy>=0.6.0", - "pyorc", - "msgpack", + "scipy", + "tokenizers==0.13.1", "transformers==4.24.0", "tzdata", -] +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/cudf" diff --git a/python/cudf_kafka/MANIFEST.in b/python/cudf_kafka/MANIFEST.in new file mode 100644 index 00000000000..249a9238816 --- /dev/null +++ b/python/cudf_kafka/MANIFEST.in @@ -0,0 +1,3 @@ +# Cython files +recursive-include cudf_kafka *.pxd +recursive-include cudf_kafka *.pyx diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 308a7869bc0..ccaa08eeef5 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -3,10 +3,41 @@ [build-system] requires = [ - "wheel", - "setuptools", "cython>=0.29,<0.30", + "numpy>=1.21", + "pyarrow==10.0.1.*", + "setuptools", + "wheel", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. + +[project] +name = "cudf_kafka" +version = "23.04.00" +description = "cuDF Kafka Datasource" +readme = { file = "README.md", content-type = "text/markdown" } +authors = [ + { name = "NVIDIA Corporation" }, ] +license = { text = "Apache 2.0" } +requires-python = ">=3.8" +dependencies = [ + "confluent-kafka==1.7.0", + "cudf==23.4.*", +] # This list was generated by `rapids-dependency-file-generator`. 
To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. + +[project.optional-dependencies] +test = [ + "pytest", + "pytest-cov", + "pytest-xdist", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. + +[project.urls] +Homepage = "https://github.com/rapidsai/cudf" +Documentation = "https://docs.rapids.ai/api/cudf/stable/" + +[tool.setuptools] +license-files = ["LICENSE"] [tool.isort] line_length = 79 diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py index c39b65cdb55..c915b7e80ab 100644 --- a/python/cudf_kafka/setup.py +++ b/python/cudf_kafka/setup.py @@ -10,10 +10,6 @@ from setuptools import find_packages, setup from setuptools.extension import Extension -install_requires = ["cudf", "cython"] - -extras_require = {"test": ["pytest", "pytest-xdist"]} - cython_files = ["cudf_kafka/_lib/*.pyx"] CUDA_HOME = os.environ.get("CUDA_HOME", False) @@ -85,23 +81,6 @@ ] setup( - name="cudf_kafka", - version="23.04.00", - description="cuDF Kafka Datasource", - url="https://github.com/rapidsai/cudf", - author="NVIDIA Corporation", - license="Apache 2.0", - classifiers=[ - "Intended Audience :: Developers", - "Topic :: Streaming", - "Topic :: Scientific/Engineering", - "Topic :: Apache Kafka", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - ], # Include the separately-compiled shared library ext_modules=cythonize( extensions, @@ -115,7 +94,5 @@ find_packages(include=["cudf_kafka._lib*"]), ["*.pxd"], ), - install_requires=install_requires, - extras_require=extras_require, zip_safe=False, ) diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 315621fa3c1..657b3865495 100644 --- 
a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -3,9 +3,9 @@ [build-system] build-backend = "setuptools.build_meta" requires = [ - "wheel", "setuptools", -] + "wheel", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project] name = "custreamz" @@ -18,9 +18,10 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.8" dependencies = [ - "cudf", - "cudf_kafka", -] + "cudf==23.4.*", + "cudf_kafka==23.4.*", + "streamz", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", "Topic :: Streaming", @@ -36,8 +37,9 @@ classifiers = [ [project.optional-dependencies] test = [ "pytest", + "pytest-cov", "pytest-xdist", -] +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/cudf" diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 79a9aca9e96..49e1cb38da4 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -3,9 +3,9 @@ [build-system] build-backend = "setuptools.build_meta" requires = [ - "wheel", "setuptools", -] + "wheel", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project] name = "dask_cudf" @@ -18,14 +18,14 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.8" dependencies = [ + "cudf==23.4.*", + "cupy-cuda11x>=9.5.0,<12.0.0a0", "dask>=2023.1.1", "distributed>=2023.1.1", "fsspec>=0.6.0", "numpy>=1.21", "pandas>=1.3,<1.6.0dev0", - "cudf==23.4.*", - "cupy-cuda11x", -] +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", "Topic :: Database", @@ -40,12 +40,12 @@ dynamic = ["entry-points"] [project.optional-dependencies] test = [ - "numpy>=1.21", - "pandas>=1.3,<1.6.0dev0", + "dask-cuda==23.4.*", + "numba>=0.56.2", "pytest", + "pytest-cov", "pytest-xdist", - "numba>=0.56.2", -] +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/cudf" From a33e368bf34a797e7a756cf05f27b6bede2878c2 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 15 Mar 2023 14:16:31 -0500 Subject: [PATCH 13/63] Mark dlpack tensor deleter as noexcept to match PyCapsule_Destructor signature. (#12921) This is a fix for a Cython problem that appears when building with Cython 3.0.0 beta 1. There are a bunch of warnings I'd like to fix as well, but this is the only hard error. The fix is to use `noexcept` in the deleter definition. ``` Error compiling Cython file: ------------------------------------------------------------ ... 
) return pycapsule.PyCapsule_New( dlpack_tensor, 'dltensor', dlmanaged_tensor_pycapsule_deleter ^ ------------------------------------------------------------ /home/bdice/code/cudf/python/cudf/cudf/_lib/interop.pyx:69:8: Cannot assign type 'void (object) except *' to 'PyCapsule_Destructor' ``` Authors: - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) Approvers: - https://github.com/jakirkham - Lawrence Mitchell (https://github.com/wence-) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/12921 --- python/cudf/cudf/_lib/interop.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 92840561563..c5d8c48fa2c 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cpython cimport pycapsule from libcpp.memory cimport shared_ptr, unique_ptr @@ -70,7 +70,7 @@ def to_dlpack(list source_columns): ) -cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj): +cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: cdef DLManagedTensor* dlpack_tensor = 0 try: dlpack_tensor = pycapsule.PyCapsule_GetPointer( From 6d264b2e7930662048e93c28cf21f164a3c742ee Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 15 Mar 2023 14:51:33 -0700 Subject: [PATCH 14/63] Make Parquet writer `nullable` option application to single table writes (#12933) When writing multiple tables into a single Parquet file, users can run into issues if a column does not have nulls in the first table, but have some in other tables. The `nullable` member of `column_in_metadata` was originally added to address this and allow users to enforce nullability of columns from multiple tables. 
Because of this, the `nullable` option is only applied to chunked writes. Recently, a different use for the option has been identified, where tables are stored into individual Parquet files, which are later read and the read tables are concatenated. Without the option to enforce nullability, Parquet files can end up with different nullabilities, i.e. different schemas, causing concatenation to fail. This PR allows the nullable option to apply to single writes as well. The write call throws if user tried to write a column with nulls as non-nullable. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/12933 --- cpp/include/cudf/io/types.hpp | 2 -- cpp/src/io/parquet/writer_impl.cu | 23 ++++++--------- cpp/tests/io/parquet_test.cpp | 47 +++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 16 deletions(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 6f97eb768d9..7426811a18d 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -519,8 +519,6 @@ class column_in_metadata { /** * @brief Set the nullability of this column * - * Only valid in case of chunked writes. In single writes, this option is ignored. 
- * * @param nullable Whether this column is nullable * @return this for chaining */ diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 2c9bff33a14..5f407b5e774 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -509,20 +509,15 @@ inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, column_in_metadata const& col_meta, bool single_write_mode) { - if (single_write_mode) { - return col->nullable(); - } else { - if (col_meta.is_nullability_defined()) { - CUDF_EXPECTS(col_meta.nullable() || !col->nullable(), - "Mismatch in metadata prescribed nullability and input column nullability. " - "Metadata for nullable input column cannot prescribe nullability = false"); - return col_meta.nullable(); - } else { - // For chunked write, when not provided nullability, we assume the worst case scenario - // that all columns are nullable. - return true; - } - } + if (col_meta.is_nullability_defined()) { + CUDF_EXPECTS(col_meta.nullable() || !col->nullable(), + "Mismatch in metadata prescribed nullability and input column nullability. " + "Metadata for nullable input column cannot prescribe nullability = false"); + return col_meta.nullable(); + } + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. 
+ return not single_write_mode or col->nullable(); } /** diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index b682ecbbae9..b74a89504a0 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -5210,4 +5210,51 @@ TYPED_TEST(ParquetReaderSourceTest, BufferSourceArrayTypes) } } +TEST_F(ParquetWriterTest, UserNullability) +{ + auto weight_col = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}}; + auto ages_col = cudf::test::fixed_width_column_wrapper{{30, 27, 5}}; + auto struct_col = cudf::test::structs_column_wrapper{weight_col, ages_col}; + + auto expected = table_view({struct_col}); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_nullability(false); + expected_metadata.column_metadata[0].child(0).set_nullability(true); + + auto filepath = temp_env->get_temp_filepath("SingleWriteNullable.parquet"); + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .metadata(&expected_metadata); + cudf::io::write_parquet(write_opts); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + EXPECT_FALSE(result.tbl->view().column(0).nullable()); + EXPECT_TRUE(result.tbl->view().column(0).child(0).nullable()); + EXPECT_FALSE(result.tbl->view().column(0).child(1).nullable()); +} + +TEST_F(ParquetWriterTest, UserNullabilityInvalid) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 2; }); + auto col = cudf::test::fixed_width_column_wrapper{{57.5, 51.1, 15.3}, valids}; + auto expected = table_view({col}); + + auto filepath = temp_env->get_temp_filepath("SingleWriteNullableInvalid.parquet"); + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, 
expected); + // Should work without the nullability option + EXPECT_NO_THROW(cudf::io::write_parquet(write_opts)); + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_nullability(false); + write_opts.set_metadata(&expected_metadata); + // Can't write a column with nulls as not nullable + EXPECT_THROW(cudf::io::write_parquet(write_opts), cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN() From 9ceecb1346bbdf79a3ba104a3e456cc07ce92762 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 16 Mar 2023 17:46:17 +0000 Subject: [PATCH 15/63] Implement `groupby.head` and `groupby.tail` (#12939) These methods can be implemented by grouping the dataframe and then selecting appropriate slices from each group. This is less memory-efficient than it could be (since the entire grouping must be constructed before discarding most of it). - Closes #2592 - Closes #12245 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/12939 --- python/cudf/cudf/core/groupby/groupby.py | 171 +++++++++++++++++++++++ python/cudf/cudf/tests/test_groupby.py | 80 ++++++++++- 2 files changed, 247 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8ff3e17d6ff..0e671cb6412 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -608,6 +608,177 @@ def _scan(self, op: str, *args, **kwargs): aggregate = agg + def _head_tail(self, n, *, take_head: bool, preserve_order: bool): + """Return the head or tail of each group + + Parameters + ---------- + n + Number of entries to include (if negative, number of + entries to exclude) + take_head + Do we want the head or the tail of the group + preserve_order + If True, return the n rows from each group in original + dataframe order (this mimics pandas 
behavior though is + more expensive). + + Returns + ------- + New DataFrame or Series + + Notes + ----- + Unlike pandas, this returns an object in group order, not + original order, unless ``preserve_order`` is ``True``. + """ + # A more memory-efficient implementation would merge the take + # into the grouping, but that probably requires a new + # aggregation scheme in libcudf. This is probably "fast + # enough" for most reasonable input sizes. + _, offsets, _, group_values = self._grouped() + group_offsets = np.asarray(offsets, dtype=np.int32) + size_per_group = np.diff(group_offsets) + # "Out of bounds" n for the group size either means no entries + # (negative) or all the entries (positive) + if n < 0: + size_per_group = np.maximum( + size_per_group + n, 0, out=size_per_group + ) + else: + size_per_group = np.minimum(size_per_group, n, out=size_per_group) + if take_head: + group_offsets = group_offsets[:-1] + else: + group_offsets = group_offsets[1:] - size_per_group + to_take = np.arange(size_per_group.sum(), dtype=np.int32) + fixup = np.empty_like(size_per_group) + fixup[0] = 0 + np.cumsum(size_per_group[:-1], out=fixup[1:]) + to_take += np.repeat(group_offsets - fixup, size_per_group) + to_take = as_column(to_take) + result = group_values.iloc[to_take] + if preserve_order: + # Can't use _mimic_pandas_order because we need to + # subsample the gather map from the full input ordering, + # rather than permuting the gather map of the output. + _, (ordering,), _ = self._groupby.groups( + [arange(0, self.obj._data.nrows)] + ) + # Invert permutation from original order to groups on the + # subset of entries we want. 
+ gather_map = ordering.take(to_take).argsort() + return result.take(gather_map) + else: + return result + + @_cudf_nvtx_annotate + def head(self, n: int = 5, *, preserve_order: bool = True): + """Return first n rows of each group + + Parameters + ---------- + n + If positive: number of entries to include from start of group + If negative: number of entries to exclude from end of group + + preserve_order + If True (default), return the n rows from each group in + original dataframe order (this mimics pandas behavior + though is more expensive). If you don't need rows in + original dataframe order you will see a performance + improvement by setting ``preserve_order=False``. In both + cases, the original index is preserved, so ``.loc``-based + indexing will work identically. + + Returns + ------- + Series or DataFrame + Subset of the original grouped object as determined by n + + See Also + -------- + .tail + + Examples + -------- + >>> df = cudf.DataFrame( + ... { + ... "a": [1, 0, 1, 2, 2, 1, 3, 2, 3, 3, 3], + ... "b": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + ... } + ... ) + >>> df.groupby("a").head(1) + a b + 0 1 0 + 1 0 1 + 3 2 3 + 6 3 6 + >>> df.groupby("a").head(-2) + a b + 0 1 0 + 3 2 3 + 6 3 6 + 8 3 8 + """ + return self._head_tail( + n, take_head=True, preserve_order=preserve_order + ) + + @_cudf_nvtx_annotate + def tail(self, n: int = 5, *, preserve_order: bool = True): + """Return last n rows of each group + + Parameters + ---------- + n + If positive: number of entries to include from end of group + If negative: number of entries to exclude from start of group + + preserve_order + If True (default), return the n rows from each group in + original dataframe order (this mimics pandas behavior + though is more expensive). If you don't need rows in + original dataframe order you will see a performance + improvement by setting ``preserve_order=False``. In both + cases, the original index is preserved, so ``.loc``-based + indexing will work identically. 
+ + Returns + ------- + Series or DataFrame + Subset of the original grouped object as determined by n + + + See Also + -------- + .head + + Examples + -------- + >>> df = cudf.DataFrame( + ... { + ... "a": [1, 0, 1, 2, 2, 1, 3, 2, 3, 3, 3], + ... "b": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + ... } + ... ) + >>> df.groupby("a").tail(1) + a b + 1 0 1 + 5 1 5 + 7 2 7 + 10 3 10 + >>> df.groupby("a").tail(-2) + a b + 5 1 5 + 7 2 7 + 9 3 9 + 10 3 10 + """ + return self._head_tail( + n, take_head=False, preserve_order=preserve_order + ) + def nth(self, n): """ Return the nth row from each group. diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 0751ef7ca67..35a01b81042 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2,6 +2,7 @@ import datetime import itertools +import operator import textwrap from decimal import Decimal @@ -1474,7 +1475,6 @@ def test_grouping(grouper): @pytest.mark.parametrize("agg", [lambda x: x.count(), "count"]) @pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]]) def test_groupby_count(agg, by): - pdf = pd.DataFrame( {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 1], "c": [1, 2, None, 4, 5]} ) @@ -1540,7 +1540,6 @@ def test_groupby_nth(n, by): reason="https://github.com/pandas-dev/pandas/issues/43209", ) def test_raise_data_error(): - pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) gdf = cudf.from_pandas(pdf) @@ -1551,7 +1550,6 @@ def test_raise_data_error(): def test_drop_unsupported_multi_agg(): - gdf = cudf.DataFrame( {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]} ) @@ -2567,7 +2565,6 @@ def foo(x): ], ) def test_groupby_apply_series_args(func, args): - got = make_frame(DataFrame, 100).groupby("x").y.apply(func, *args) expect = ( make_frame(pd.DataFrame, 100) @@ -2963,3 +2960,78 @@ def test_groupby_dtypes(groups): pdf = df.to_pandas() assert_eq(pdf.groupby(groups).dtypes, df.groupby(groups).dtypes) + + 
+class TestHeadTail: + @pytest.fixture(params=[-3, -2, -1, 0, 1, 2, 3], ids=lambda n: f"{n=}") + def n(self, request): + return request.param + + @pytest.fixture( + params=[False, True], ids=["no-preserve-order", "preserve-order"] + ) + def preserve_order(self, request): + return request.param + + @pytest.fixture + def df(self): + return cudf.DataFrame( + { + "a": [1, 0, 1, 2, 2, 1, 3, 2, 3, 3, 3], + "b": [0, 1, 2, 4, 3, 5, 6, 7, 9, 8, 10], + } + ) + + @pytest.fixture(params=[True, False], ids=["head", "tail"]) + def take_head(self, request): + return request.param + + @pytest.fixture + def expected(self, df, n, take_head, preserve_order): + if n == 0: + # We'll get an empty dataframe in this case + return df._empty_like(keep_index=True) + else: + if preserve_order: + # Should match pandas here + g = df.to_pandas().groupby("a") + if take_head: + return g.head(n=n) + else: + return g.tail(n=n) + else: + # We groupby "a" which is the first column. This + # possibly relies on an implementation detail that for + # integer group keys, cudf produces groups in sorted + # (ascending) order. 
+ keyfunc = operator.itemgetter(0) + if take_head or n == 0: + # Head does group[:n] as does tail for n == 0 + slicefunc = operator.itemgetter(slice(None, n)) + else: + # Tail does group[-n:] except when n == 0 + slicefunc = operator.itemgetter( + slice(-n, None) if n else slice(0) + ) + values_to_sort = np.hstack( + [df.values_host, np.arange(len(df)).reshape(-1, 1)] + ) + expect_a, expect_b, index = zip( + *itertools.chain.from_iterable( + slicefunc(list(group)) + for _, group in itertools.groupby( + sorted(values_to_sort.tolist(), key=keyfunc), + key=keyfunc, + ) + ) + ) + return cudf.DataFrame( + {"a": expect_a, "b": expect_b}, index=index + ) + + def test_head_tail(self, df, n, take_head, expected, preserve_order): + if take_head: + actual = df.groupby("a").head(n=n, preserve_order=preserve_order) + else: + actual = df.groupby("a").tail(n=n, preserve_order=preserve_order) + assert_eq(actual, expected) From 1e377fc40d4146c2bcb7f4d7413b54a671a50e8b Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Thu, 16 Mar 2023 16:45:28 -0700 Subject: [PATCH 16/63] Implement initial support for avro logical types (#6482) (#12788) This includes the date type. Scaffolding has been put in place to handle the other logical types. This closes #6482. 
Authors: - Trent Nelson (https://github.com/tpn) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/12788 --- cpp/src/io/avro/avro.cpp | 79 ++++--- cpp/src/io/avro/avro.hpp | 5 +- cpp/src/io/avro/avro_common.hpp | 69 +++++- cpp/src/io/avro/avro_gpu.cu | 114 +++++++--- cpp/src/io/avro/avro_gpu.hpp | 5 +- cpp/src/io/avro/reader_impl.cu | 39 +++- .../test_avro_reader_fastavro_integration.py | 206 +++++++++++++++++- 7 files changed, 446 insertions(+), 71 deletions(-) diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 48c458109c1..aa0e36d9972 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -124,9 +124,11 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) md->total_data_size = m_cur - (m_base + md->metadata_size); // Extract columns for (size_t i = 0; i < md->schema.size(); i++) { - type_kind_e kind = md->schema[i].kind; - if (kind > type_null && kind < type_record) { - // Primitive type column + type_kind_e kind = md->schema[i].kind; + logicaltype_kind_e logical_kind = md->schema[i].logical_kind; + + bool is_supported_kind = ((kind > type_null) && (kind < type_record)); + if (is_supported_logical_type(logical_kind) || is_supported_kind) { column_desc col; int parent_idx = md->schema[i].parent_idx; col.schema_data_idx = (int32_t)i; @@ -141,7 +143,9 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) --num_children) { int skip = 1; if (pos == i) { - col.parent_union_idx = md->schema[parent_idx].num_children - num_children; + // parent_idx will always be pointing to our immediate parent + // union at this point. + col.parent_union_idx = parent_idx; } else if (md->schema[pos].kind == type_null) { col.schema_null_idx = pos; break; @@ -152,7 +156,9 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) } while (skip != 0); } } - // Ignore the root or array entries + // We want to "inherit" the column name from our parent union's + // name, as long as we're not dealing with the root (parent_idx == 0) + // or array entries. 
if ((parent_idx != 0 && md->schema[parent_idx].kind != type_array) || col.name.length() == 0) { if (col.name.length() > 0) { col.name.insert(0, 1, '.'); } @@ -179,13 +185,14 @@ enum json_state_e { state_nextsymbol, }; -enum { +enum attrtype_e { attrtype_none = -1, attrtype_type = 0, attrtype_name, attrtype_fields, attrtype_symbols, attrtype_items, + attrtype_logicaltype, }; /** @@ -205,26 +212,40 @@ bool schema_parser::parse(std::vector& schema, const std::string& int depth = 0, parent_idx = -1, entry_idx = -1; json_state_e state = state_attrname; std::string str; - const std::unordered_map typenames = {{"null", type_null}, - {"boolean", type_boolean}, - {"int", type_int}, - {"long", type_long}, - {"float", type_float}, - {"double", type_double}, - {"bytes", type_bytes}, - {"string", type_string}, - {"record", type_record}, - {"enum", type_enum}, - {"array", type_array}}; - const std::unordered_map attrnames = {{"type", attrtype_type}, - {"name", attrtype_name}, - {"fields", attrtype_fields}, - {"symbols", attrtype_symbols}, - {"items", attrtype_items}}; - int cur_attr = attrtype_none; - m_base = json_str.c_str(); - m_cur = m_base; - m_end = m_base + json_str.length(); + const std::unordered_map typenames = { + {"null", type_null}, + {"boolean", type_boolean}, + {"int", type_int}, + {"long", type_long}, + {"float", type_float}, + {"double", type_double}, + {"bytes", type_bytes}, + {"string", type_string}, + {"record", type_record}, + {"enum", type_enum}, + {"array", type_array}, + {"union", type_union}, + {"fixed", type_fixed}, + {"decimal", type_decimal}, + {"date", type_date}, + {"time-millis", type_time_millis}, + {"time-micros", type_time_micros}, + {"timestamp-millis", type_timestamp_millis}, + {"timestamp-micros", type_timestamp_micros}, + {"local-timestamp-millis", type_local_timestamp_millis}, + {"local-timestamp-micros", type_local_timestamp_micros}, + {"duration", type_duration}}; + const std::unordered_map attrnames = { + {"type", attrtype_type}, + 
{"name", attrtype_name}, + {"fields", attrtype_fields}, + {"symbols", attrtype_symbols}, + {"items", attrtype_items}, + {"logicalType", attrtype_logicaltype}}; + attrtype_e cur_attr = attrtype_none; + m_base = json_str.c_str(); + m_cur = m_base; + m_end = m_base + json_str.length(); while (more_data()) { int c = *m_cur++; switch (c) { @@ -250,6 +271,10 @@ bool schema_parser::parse(std::vector& schema, const std::string& auto t = typenames.find(str); if (t == typenames.end()) return false; schema[entry_idx].kind = t->second; + } else if (cur_attr == attrtype_logicaltype) { + auto t = typenames.find(str); + if (t == typenames.end()) return false; + schema[entry_idx].logical_kind = static_cast(t->second); } else if (cur_attr == attrtype_name) { if (entry_idx < 0) return false; schema[entry_idx].name = std::move(str); diff --git a/cpp/src/io/avro/avro.hpp b/cpp/src/io/avro/avro.hpp index 1ca50f04d18..ef294893e4b 100644 --- a/cpp/src/io/avro/avro.hpp +++ b/cpp/src/io/avro/avro.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,7 +42,8 @@ struct schema_entry { int32_t parent_idx = -1; // index of parent entry in schema array, negative if no parent int32_t num_children = 0; type_kind_e kind = type_not_set; - std::string name = ""; + logicaltype_kind_e logical_kind = logicaltype_not_set; + std::string name = ""; std::vector symbols; }; diff --git a/cpp/src/io/avro/avro_common.hpp b/cpp/src/io/avro/avro_common.hpp index 229ffa5da04..a3025650ae9 100644 --- a/cpp/src/io/avro/avro_common.hpp +++ b/cpp/src/io/avro/avro_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,8 +56,75 @@ enum type_kind_e { type_record, type_union, type_array, + type_fixed, + // Logical types + type_decimal, + type_uuid, + type_date, + type_time_millis, + type_time_micros, + type_timestamp_millis, + type_timestamp_micros, + type_local_timestamp_millis, + type_local_timestamp_micros, + type_duration, }; +enum logicaltype_kind_e { + logicaltype_not_set = 0, + // N.B. We intentionally mirror the logicaltype enum values with their + // equivalent type enum value, as this allows us to cast the type + // value directly to a logical type without an intermediate + // mapping step, and vice versa, e.g.: + // + // auto kind = type_date; + // auto logical_kind = static_cast(type_date); + // // logical_kind == logicaltype_kind_e::logicaltype_date + // + // And: + // + // auto logical_kind = logicaltype_date; + // auto kind = static_cast(logical_kind); + // // kind == type_kind_e::type_date + // + logicaltype_decimal = type_decimal, + logicaltype_uuid, + logicaltype_date, + logicaltype_time_millis, + logicaltype_time_micros, + logicaltype_timestamp_millis, + logicaltype_timestamp_micros, + logicaltype_local_timestamp_millis, + logicaltype_local_timestamp_micros, + logicaltype_duration, +}; + +/** + * @brief Determines if the supplied logical type is currently supported. + * + * @param[in] logical_kind Supplies the logicaltype_kind_e enum value. + * + * @return true if the logical type is supported, false otherwise. 
+ */ +inline constexpr bool is_supported_logical_type(logicaltype_kind_e logical_kind) +{ + switch (logical_kind) { + case logicaltype_date: return true; + + case logicaltype_not_set: [[fallthrough]]; + case logicaltype_decimal: [[fallthrough]]; + case logicaltype_uuid: [[fallthrough]]; + case logicaltype_time_millis: [[fallthrough]]; + case logicaltype_time_micros: [[fallthrough]]; + case logicaltype_timestamp_millis: [[fallthrough]]; + case logicaltype_timestamp_micros: [[fallthrough]]; + case logicaltype_local_timestamp_millis: [[fallthrough]]; + case logicaltype_local_timestamp_micros: [[fallthrough]]; + case logicaltype_duration: [[fallthrough]]; + default: return false; + } +} + using cudf::io::detail::string_index_pair; } // namespace avro diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index 03edb7ed6cb..64c572424e0 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -78,8 +78,11 @@ avro_decode_row(schemadesc_s const* schema, uint32_t array_start = 0, array_repeat_count = 0; int array_children = 0; for (uint32_t i = 0; i < schema_len;) { - uint32_t kind = schema[i].kind; - int skip = 0; + type_kind_e kind = schema[i].kind; + logicaltype_kind_e logical_kind = schema[i].logical_kind; + int skip = 0; + + if (is_supported_logical_type(logical_kind)) { kind = static_cast(logical_kind); } if (kind == type_union) { int skip_after; @@ -93,7 +96,11 @@ avro_decode_row(schemadesc_s const* schema, --skip; } if (i >= schema_len || skip_after < 0) break; - kind = schema[i].kind; + kind = schema[i].kind; + logical_kind = schema[i].logical_kind; + if (is_supported_logical_type(logical_kind)) { + kind = static_cast(logical_kind); + } skip = skip_after; } @@ -106,36 +113,38 @@ avro_decode_row(schemadesc_s const* schema, } break; - case type_int: - case type_long: - case type_bytes: - case type_string: - case type_enum: { + case type_int: { int64_t v = avro_decode_zigzag_varint(cur, end); - if (kind == type_int) { - if (dataptr != nullptr && row < max_rows) { - static_cast(dataptr)[row] = static_cast(v); - } - } else if (kind == type_long) { - if (dataptr != nullptr && row < max_rows) { static_cast(dataptr)[row] = v; } - } else { // string or enum - size_t count = 0; - const char* ptr = nullptr; - if (kind == type_enum) { // dictionary - size_t idx = schema[i].count + v; - if (idx < global_dictionary.size()) { - ptr = global_dictionary[idx].first; - count = global_dictionary[idx].second; - } - } else if (v >= 0 && cur + v <= end) { // string - ptr = reinterpret_cast(cur); - count = (size_t)v; - cur += count; - } - if (dataptr != nullptr && row < max_rows) { - static_cast(dataptr)[row].first = ptr; - static_cast(dataptr)[row].second = count; + if (dataptr != nullptr && row < max_rows) { + static_cast(dataptr)[row] = static_cast(v); + } + } break; + + case type_long: { + int64_t v = avro_decode_zigzag_varint(cur, end); + if (dataptr != nullptr 
&& row < max_rows) { static_cast(dataptr)[row] = v; } + } break; + + case type_bytes: [[fallthrough]]; + case type_string: [[fallthrough]]; + case type_enum: { + int64_t v = avro_decode_zigzag_varint(cur, end); + size_t count = 0; + const char* ptr = nullptr; + if (kind == type_enum) { // dictionary + size_t idx = schema[i].count + v; + if (idx < global_dictionary.size()) { + ptr = global_dictionary[idx].first; + count = global_dictionary[idx].second; } + } else if (v >= 0 && cur + v <= end) { // string or bytes + ptr = reinterpret_cast(cur); + count = (size_t)v; + cur += count; + } + if (dataptr != nullptr && row < max_rows) { + static_cast(dataptr)[row].first = ptr; + static_cast(dataptr)[row].second = count; } } break; @@ -190,7 +199,48 @@ avro_decode_row(schemadesc_s const* schema, skip += schema[i].count; // Should always be 1 } } break; + + case type_duration: { + // A duration logical type annotates Avro fixed type of size 12, which + // stores three little-endian unsigned integers that represent durations + // at different granularities of time. The first stores a number in + // months, the second stores a number in days, and the third stores a + // number in milliseconds. + CUDF_UNREACHABLE("avro type 'duration' not yet implemented"); + } break; + + // N.B. These aren't handled yet, see the discussion on + // https://github.com/rapidsai/cudf/pull/12788. The decoding logic + // is correct, though, so there's no harm in having them here. + case type_timestamp_millis: [[fallthrough]]; + case type_timestamp_micros: [[fallthrough]]; + case type_local_timestamp_millis: [[fallthrough]]; + case type_local_timestamp_micros: [[fallthrough]]; + case type_time_millis: [[fallthrough]]; + case type_time_micros: { + // N.B. time-millis is stored as a 32-bit int, however, cudf expects an + // int64 for DURATION_MILLISECONDS. 
From our perspective, the fact + // that time-millis comes from a 32-bit int is hidden from us by + // way of the zig-zag varint encoding, so we can safely treat them + // both as int64_t. Everything else is 64-bit in both avro and + // cudf. + CUDF_UNREACHABLE("avro time/timestamp types not yet implemented"); + // + // When we do implement these, the following decoding logic should + // be correct: + // + // int64_t v = avro_decode_zigzag_varint(cur, end); + // if (dataptr != nullptr && row < max_rows) { static_cast(dataptr)[row] = v; } + } break; + + case type_date: { + int64_t v = avro_decode_zigzag_varint(cur, end); + if (dataptr != nullptr && row < max_rows) { + static_cast(dataptr)[row] = static_cast(v); + } + } break; } + if (array_repeat_count != 0) { array_children--; if (schema[i].kind >= type_record) { array_children += schema[i].count; } diff --git a/cpp/src/io/avro/avro_gpu.hpp b/cpp/src/io/avro/avro_gpu.hpp index 7bfb3a75250..6575d76d8d9 100644 --- a/cpp/src/io/avro/avro_gpu.hpp +++ b/cpp/src/io/avro/avro_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,7 +30,8 @@ namespace gpu { * @brief Struct to describe the avro schema */ struct schemadesc_s { - uint32_t kind; // avro type kind + cudf::io::avro::type_kind_e kind; // avro type kind + cudf::io::avro::logicaltype_kind_e logical_kind; // avro logicaltype kind uint32_t count; // for records/unions: number of following child columns, for nulls: global // null_count, for enums: dictionary ofs void* dataptr; // Ptr to column data, or null if column not selected diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index d9da2f083d1..cd557ff166a 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -66,15 +66,42 @@ namespace { */ type_id to_type_id(avro::schema_entry const* col) { - switch (col->kind) { + avro::type_kind_e kind; + + // N.B. The switch statement seems a bit ridiculous for a single type, but the + // plan is to incrementally add more types to it as support is added for + // them in the future. + switch (col->logical_kind) { + case avro::logicaltype_date: kind = static_cast(col->logical_kind); break; + case avro::logicaltype_not_set: [[fallthrough]]; + default: kind = col->kind; break; + } + + switch (kind) { case avro::type_boolean: return type_id::BOOL8; case avro::type_int: return type_id::INT32; case avro::type_long: return type_id::INT64; case avro::type_float: return type_id::FLOAT32; case avro::type_double: return type_id::FLOAT64; - case avro::type_bytes: + case avro::type_bytes: [[fallthrough]]; case avro::type_string: return type_id::STRING; + case avro::type_date: return type_id::TIMESTAMP_DAYS; + case avro::type_timestamp_millis: return type_id::TIMESTAMP_MILLISECONDS; + case avro::type_timestamp_micros: return type_id::TIMESTAMP_MICROSECONDS; + case avro::type_local_timestamp_millis: return type_id::TIMESTAMP_MILLISECONDS; + case avro::type_local_timestamp_micros: return type_id::TIMESTAMP_MICROSECONDS; case avro::type_enum: return (!col->symbols.empty()) ? 
type_id::STRING : type_id::INT32; + // The avro time-millis and time-micros types are closest to Arrow's + // TIME32 and TIME64. They're single-day units, i.e. they won't exceed + // 23:59:59.9999 (or .999999 for micros). There's no equivalent cudf + // type for this; type_id::DURATION_MILLISECONDS/MICROSECONDS are close, + // but they're not semantically the same. + case avro::type_time_millis: [[fallthrough]]; + case avro::type_time_micros: [[fallthrough]]; + // There's no cudf equivalent for the avro duration type, which is a fixed + // 12 byte value which stores three little-endian unsigned 32-bit integers + // representing months, days, and milliseconds, respectively. + case avro::type_duration: [[fallthrough]]; default: return type_id::EMPTY; } } @@ -141,6 +168,7 @@ class metadata : public file_metadata { break; } } + if (!column_in_array) { auto col_type = to_type_id(&schema[columns[i].schema_data_idx]); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unsupported data type"); @@ -360,7 +388,9 @@ std::vector decode_data(metadata& meta, int skip_field_cnt = 0; for (size_t i = 0; i < meta.schema.size(); i++) { - type_kind_e kind = meta.schema[i].kind; + type_kind_e kind = meta.schema[i].kind; + logicaltype_kind_e logical_kind = meta.schema[i].logical_kind; + if (skip_field_cnt != 0) { // Exclude union and array members from min_row_data_size skip_field_cnt += meta.schema[i].num_children - 1; @@ -382,7 +412,8 @@ std::vector decode_data(metadata& meta, } } if (kind == type_enum && !meta.schema[i].symbols.size()) { kind = type_int; } - schema_desc[i].kind = kind; + schema_desc[i].kind = kind; + schema_desc[i].logical_kind = logical_kind; schema_desc[i].count = (kind == type_enum) ? 
0 : static_cast(meta.schema[i].num_children); schema_desc[i].dataptr = nullptr; diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 9eb01ae31b4..ea23587ea70 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import datetime import io +import pathlib +from typing import Optional import fastavro import pytest @@ -22,8 +25,7 @@ from cudf.testing.dataset_generator import rand_dataframe -def cudf_from_avro_util(schema, records): - +def cudf_from_avro_util(schema: dict, records: list) -> cudf.DataFrame: schema = [] if schema is None else fastavro.parse_schema(schema) buffer = io.BytesIO() fastavro.writer(buffer, schema, records) @@ -244,3 +246,201 @@ def test_avro_compression(rows, codec): got_df = cudf.read_avro(buffer) assert_eq(expected_df, got_df) + + +avro_logical_type_params = [ + # (avro logical type, avro primitive type, cudf expected dtype) + ("date", "int", "datetime64[s]"), +] + + +@pytest.mark.parametrize( + "logical_type, primitive_type, expected_dtype", avro_logical_type_params +) +@pytest.mark.parametrize("namespace", [None, "root_ns"]) +@pytest.mark.parametrize("nullable", [True, False]) +@pytest.mark.parametrize("prepend_null", [True, False]) +def test_can_detect_dtypes_from_avro_logical_type( + logical_type, + primitive_type, + expected_dtype, + namespace, + nullable, + prepend_null, +): + avro_type = [{"logicalType": logical_type, "type": primitive_type}] + if nullable: + if prepend_null: + 
avro_type.insert(0, "null") + else: + avro_type.append("null") + + schema = fastavro.parse_schema( + { + "type": "record", + "name": "test", + "namespace": namespace, + "fields": [{"name": "prop", "type": avro_type}], + } + ) + + actual = cudf_from_avro_util(schema, []) + + expected = cudf.DataFrame( + {"prop": cudf.Series(None, None, expected_dtype)} + ) + + assert_eq(expected, actual) + + +def get_days_from_epoch(date: Optional[datetime.date]) -> Optional[int]: + if date is None: + return None + return (date - datetime.date(1970, 1, 1)).days + + +@pytest.mark.parametrize("namespace", [None, "root_ns"]) +@pytest.mark.parametrize("nullable", [True, False]) +@pytest.mark.parametrize("prepend_null", [True, False]) +def test_can_parse_avro_date_logical_type(namespace, nullable, prepend_null): + + avro_type = {"logicalType": "date", "type": "int"} + if nullable: + if prepend_null: + avro_type = ["null", avro_type] + else: + avro_type = [avro_type, "null"] + + schema_dict = { + "type": "record", + "name": "test", + "fields": [ + {"name": "o_date", "type": avro_type}, + ], + } + + if namespace: + schema_dict["namespace"] = namespace + + schema = fastavro.parse_schema(schema_dict) + + # Insert some None values in no particular order. These will get converted + # into avro "nulls" by the fastavro writer (or filtered out if we're not + # nullable). The first and last dates are epoch min/max values, the rest + # are arbitrarily chosen. 
+ dates = [ + None, + datetime.date(1970, 1, 1), + datetime.date(1970, 1, 2), + datetime.date(1981, 10, 25), + None, + None, + datetime.date(2012, 5, 18), + None, + datetime.date(2019, 9, 3), + None, + datetime.date(9999, 12, 31), + ] + + if not nullable: + dates = [date for date in dates if date is not None] + + days_from_epoch = [get_days_from_epoch(date) for date in dates] + + records = [{"o_date": day} for day in days_from_epoch] + + actual = cudf_from_avro_util(schema, records) + + expected = cudf.DataFrame( + {"o_date": cudf.Series(dates, dtype="datetime64[s]")} + ) + + assert_eq(expected, actual) + + +def test_alltypes_plain_avro(): + # During development of the logical type support, the Java avro tests were + # triggering CUDA kernel crashes (null pointer dereferences). We were able + # to replicate the behavior in a C++ test case, and then subsequently came + # up with this Python unit test to also trigger the problematic code path. + # + # So, unlike the other tests, this test is inherently reactive in nature, + # added simply to verify we fixed the problematic code path that was + # causing CUDA kernel crashes. + # + # See https://github.com/rapidsai/cudf/pull/12788#issuecomment-1468822875 + # for more information. 
+ relpath = "../../../../java/src/test/resources/alltypes_plain.avro" + path = pathlib.Path(__file__).parent.joinpath(relpath).resolve() + assert path.is_file(), path + path = str(path) + + with open(path, "rb") as f: + reader = fastavro.reader(f) + records = [record for record in reader] + + # For reference: + # + # >>> from pprint import pprint + # >>> pprint(reader.writer_schema) + # {'fields': [{'name': 'id', 'type': ['int', 'null']}, + # {'name': 'bool_col', 'type': ['boolean', 'null']}, + # {'name': 'tinyint_col', 'type': ['int', 'null']}, + # {'name': 'smallint_col', 'type': ['int', 'null']}, + # {'name': 'int_col', 'type': ['int', 'null']}, + # {'name': 'bigint_col', 'type': ['long', 'null']}, + # {'name': 'float_col', 'type': ['float', 'null']}, + # {'name': 'double_col', 'type': ['double', 'null']}, + # {'name': 'date_string_col', 'type': ['bytes', 'null']}, + # {'name': 'string_col', 'type': ['bytes', 'null']}, + # {'name': 'timestamp_col', + # 'type': [{'logicalType': 'timestamp-micros', + # 'type': 'long'}, + # 'null']}], + # 'name': 'topLevelRecord', + # 'type': 'record'} + # + # >>> pprint(records[0]) + # {'bigint_col': 0, + # 'bool_col': True, + # 'date_string_col': b'03/01/09', + # 'double_col': 0.0, + # 'float_col': 0.0, + # 'id': 4, + # 'int_col': 0, + # 'smallint_col': 0, + # 'string_col': b'0', + # 'timestamp_col': datetime.datetime(2009, 3, 1, 0, 0, + # tzinfo=datetime.timezone.utc), + # 'tinyint_col': 0} + + # Nothing particularly special about these columns, other than them being + # the ones that @davidwendt used to coerce the crash. + columns = ["bool_col", "int_col", "timestamp_col"] + + # This next line would trigger the fatal CUDA kernel crash. + actual = cudf.read_avro(path, columns=columns) + + # If we get here, we haven't crashed, obviously. Verify the returned data + # frame meets our expectations. 
We need to fiddle with the dtypes of the + # expected data frame in order to correctly match the schema definition and + # our corresponding read_avro()-returned data frame. + + data = [{column: row[column] for column in columns} for row in records] + expected = cudf.DataFrame(data) + + # The fastavro.reader supports the `'logicalType': 'timestamp-micros'` used + # by the 'timestamp_col' column, which is converted into Python + # datetime.datetime() objects (see output of pprint(records[0]) above). + # As we don't support that logical type yet in cudf, we need to convert to + # int64, then divide by 1000 to convert from nanoseconds to microseconds. + timestamps = expected["timestamp_col"].astype("int64") + timestamps //= 1000 + expected["timestamp_col"] = timestamps + + # Furthermore, we need to force the 'int_col' into an int32, per the schema + # definition. (It ends up as an int64 due to cudf.DataFrame() defaulting + # all Python int values to int64 sans a dtype= override.) + expected["int_col"] = expected["int_col"].astype("int32") + + assert_eq(actual, expected) From 3540613436ee95296be61f66fed72de44314fb33 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 16 Mar 2023 23:53:46 -0700 Subject: [PATCH 17/63] Fix calculation of null counts for Parquet statistics (#12938) The current Parquet writer sometimes generates wrong values for `null_count` in the column chunk statistics and page indexes. This occurs for nested schemas when nulls occur at a level above the leaf values. This PR fixes the calculation by adding a `non_leaf_nulls` field to the `statistics_group` struct. This field is added to the chunk `null_count` calculated over leaf values in `gpu_calculate_group_statistics()`. 
Authors: - Ed Seidl (https://github.com/etseidl) - Nghia Truong (https://github.com/ttnghia) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/12938 --- cpp/src/io/parquet/page_enc.cu | 1 + cpp/src/io/statistics/column_statistics.cuh | 10 +- cpp/src/io/statistics/statistics.cuh | 3 +- cpp/tests/io/parquet_test.cpp | 133 ++++++++++++++++++++ 4 files changed, 144 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 5a12acec2a3..e48696fcb9b 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -282,6 +282,7 @@ __global__ void __launch_bounds__(128) g.col = ck_g->col_desc; g.start_row = fragments[frag_id].start_value_idx; g.num_rows = fragments[frag_id].num_leaf_values; + g.non_leaf_nulls = fragments[frag_id].num_values - g.num_rows; groups[frag_id] = g; } } diff --git a/cpp/src/io/statistics/column_statistics.cuh b/cpp/src/io/statistics/column_statistics.cuh index 125235ebf2f..0b09cb63d19 100644 --- a/cpp/src/io/statistics/column_statistics.cuh +++ b/cpp/src/io/statistics/column_statistics.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -129,7 +129,13 @@ struct calculate_group_statistics_functor { chunk = block_reduce(chunk, storage); - if (t == 0) { s.ck = get_untyped_chunk(chunk); } + if (t == 0) { + // parquet wants total null count in stats, not just count of null leaf values + if constexpr (IO == detail::io_file_format::PARQUET) { + chunk.null_count += s.group.non_leaf_nulls; + } + s.ck = get_untyped_chunk(chunk); + } } template ; + + // 4 nulls + // [NULL, 2, NULL] + // [] + // [4, 5] + // NULL + lcw col0{{{{1, 2, 3}, nulls_at({0, 2})}, {}, {4, 5}, {}}, null_at(3)}; + + // 4 nulls + // [[1, 2, 3], [], [4, 5], [], [0, 6, 0]] + // [[7, 8]] + // [] + // [[]] + lcw col1{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, {{7, 8}}, lcw{}, lcw{lcw{}}}; + + // 4 nulls + // [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]] + // [[7, 8]] + // [] + // [[]] + lcw col2{{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, null_at(3)}, {{7, 8}}, lcw{}, lcw{lcw{}}}; + + // 6 nulls + // [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]] + // [[7, 8]] + // [] + // [[]] + using dlcw = cudf::test::lists_column_wrapper; + dlcw col3{{{{1., 2., 3.}, {}, {4., 5.}, {}, {{0., 6., 0.}, nulls_at({0, 2})}}, null_at(3)}, + {{7., 8.}}, + dlcw{}, + dlcw{dlcw{}}}; + + // 4 nulls + // [[1, 2, 3], [], [4, 5], NULL, [0, 6, 0]] + // [[7, 8]] + // [] + // NULL + using ui16lcw = cudf::test::lists_column_wrapper; + cudf::test::lists_column_wrapper col4{ + {{{{1, 2, 3}, {}, {4, 5}, {}, {0, 6, 0}}, null_at(3)}, {{7, 8}}, ui16lcw{}, ui16lcw{ui16lcw{}}}, + null_at(3)}; + + // 6 nulls + // [[1, 2, 3], [], [4, 5], NULL, [NULL, 6, NULL]] + // [[7, 8]] + // [] + // NULL + lcw col5{{{{{1, 2, 3}, {}, {4, 5}, {}, {{0, 6, 0}, nulls_at({0, 2})}}, null_at(3)}, + {{7, 8}}, + lcw{}, + lcw{lcw{}}}, + null_at(3)}; + + // 4 nulls + using strlcw = cudf::test::lists_column_wrapper; + cudf::test::lists_column_wrapper col6{ + {{"Monday", "Monday", "Friday"}, {}, {"Monday", "Friday"}, {}, {"Sunday", "Funday"}}, + {{"bee", "sting"}}, + strlcw{}, + strlcw{strlcw{}}}; + + // 11 nulls + // 
[[[NULL,2,NULL,4]], [[NULL,6,NULL], [8,9]]] + // [NULL, [[13],[14,15,16]], NULL] + // [NULL, [], NULL, [[]]] + // NULL + lcw col7{{ + {{{{1, 2, 3, 4}, nulls_at({0, 2})}}, {{{5, 6, 7}, nulls_at({0, 2})}, {8, 9}}}, + {{{{10, 11}, {12}}, {{13}, {14, 15, 16}}, {{17, 18}}}, nulls_at({0, 2})}, + {{lcw{lcw{}}, lcw{}, lcw{}, lcw{lcw{}}}, nulls_at({0, 2})}, + lcw{lcw{lcw{}}}, + }, + null_at(3)}; + + table_view expected({col0, col1, col2, col3, col4, col5, col6, col7}); + + int64_t const expected_null_counts[] = {4, 4, 4, 6, 4, 6, 4, 11}; + + auto const filepath = temp_env->get_temp_filepath("ColumnIndexListWithNulls.parquet"); + auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) + .compression(cudf::io::compression_type::NONE); + + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::FileMetaData fmd; + + read_footer(source, &fmd); + + for (size_t r = 0; r < fmd.row_groups.size(); r++) { + auto const& rg = fmd.row_groups[r]; + for (size_t c = 0; c < rg.columns.size(); c++) { + auto const& chunk = rg.columns[c]; + + // loop over offsets, read each page header, make sure it's a data page and that + // the first row index is correct + auto const oi = read_offset_index(source, chunk); + + int64_t num_vals = 0; + for (size_t o = 0; o < oi.page_locations.size(); o++) { + auto const& page_loc = oi.page_locations[o]; + auto const ph = read_page_header(source, page_loc); + EXPECT_EQ(ph.type, cudf::io::parquet::PageType::DATA_PAGE); + // last column has 2 values per row + EXPECT_EQ(page_loc.first_row_index * (c == rg.columns.size() - 1 ? 
2 : 1), num_vals); + num_vals += ph.data_page_header.num_values; + } + + // check null counts in column chunk stats and page indexes + auto const ci = read_column_index(source, chunk); + auto const stats = parse_statistics(chunk); + EXPECT_EQ(stats.null_count, expected_null_counts[c]); + + // should only be one page + EXPECT_FALSE(ci.null_pages[0]); + EXPECT_EQ(ci.null_counts[0], expected_null_counts[c]); + } + } +} + TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) { const char* coldata[] = { From 8881cb693baca067de5884bcfe364c5725c3c6ee Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 17 Mar 2023 08:43:18 -0400 Subject: [PATCH 18/63] Improve performance for replace-multi for long strings (#12858) Adds more efficient algorithm for multi-string version of `cudf::strings::replace` for longer strings (greater than 256 bytes on average in each row). Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12858 --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/string/replace.cpp | 4 +- cpp/src/strings/replace/multi.cu | 500 ++++++++++++++++++++++++++++ cpp/src/strings/replace/replace.cu | 95 ------ cpp/tests/strings/replace_tests.cpp | 150 ++++++++- 5 files changed, 636 insertions(+), 114 deletions(-) create mode 100644 cpp/src/strings/replace/multi.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 12b812d0bbe..c50464762c0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -547,6 +547,7 @@ add_library( src/strings/regex/regex_program.cpp src/strings/repeat_strings.cu src/strings/replace/backref_re.cu + src/strings/replace/multi.cu src/strings/replace/multi_re.cu src/strings/replace/replace.cu src/strings/replace/replace_re.cu diff --git 
a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp index b25af14ec2a..cb570020f0e 100644 --- a/cpp/benchmarks/string/replace.cpp +++ b/cpp/benchmarks/string/replace.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,7 +69,7 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) int const row_mult = 8; int const min_rowlen = 1 << 5; int const max_rowlen = 1 << 13; - int const len_mult = 4; + int const len_mult = 2; generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); } diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu new file mode 100644 index 00000000000..92ace4e7bc7 --- /dev/null +++ b/cpp/src/strings/replace/multi.cu @@ -0,0 +1,500 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +/** + * @brief Threshold to decide on using string or character-parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * the character-parallel function is used. + * Otherwise, a regular string-parallel function is used. + * + * This value was found using the replace-multi benchmark results using an + * RTX A6000. + */ +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 256; + +/** + * @brief Type used for holding the target position (first) and the + * target index (second). + */ +using target_pair = thrust::pair; + +/** + * @brief Helper functions for performing character-parallel replace + */ +struct replace_multi_parallel_fn { + __device__ char const* get_base_ptr() const + { + return d_strings.child(strings_column_view::chars_column_index).data(); + } + + __device__ size_type const* get_offsets_ptr() const + { + return d_strings.child(strings_column_view::offsets_column_index).data() + + d_strings.offset(); + } + + __device__ string_view const get_string(size_type idx) const + { + return d_strings.element(idx); + } + + __device__ string_view const get_replacement_string(size_type idx) const + { + return d_replacements.size() == 1 ? 
d_replacements[0] : d_replacements[idx]; + } + + __device__ bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } + + /** + * @brief Returns the index of the target string found at the given byte position + * in the input strings column + * + * @param idx Index of the byte position in the chars column + * @param chars_bytes Number of bytes in the chars column + */ + __device__ thrust::optional has_target(size_type idx, size_type chars_bytes) const + { + auto const d_offsets = get_offsets_ptr(); + auto const d_chars = get_base_ptr() + d_offsets[0] + idx; + size_type str_idx = -1; + for (std::size_t t = 0; t < d_targets.size(); ++t) { + auto const d_tgt = d_targets[t]; + if (!d_tgt.empty() && (idx + d_tgt.size_bytes() <= chars_bytes) && + (d_tgt.compare(d_chars, d_tgt.size_bytes()) == 0)) { + if (str_idx < 0) { + auto const idx_itr = + thrust::upper_bound(thrust::seq, d_offsets, d_offsets + d_strings.size(), idx); + str_idx = thrust::distance(d_offsets, idx_itr) - 1; + } + auto const d_str = get_string(str_idx - d_offsets[0]); + if ((d_chars + d_tgt.size_bytes()) <= (d_str.data() + d_str.size_bytes())) { return t; } + } + } + return thrust::nullopt; + } + + /** + * @brief Count the number of strings that will be produced by the replace + * + * This includes segments of the string that are not replaced as well as those + * that are replaced. 
+ * + * @param idx Index of the row in d_strings to be processed + * @param d_positions Positions of the targets found in the chars column + * @param d_targets_offsets Offsets identify which target positions go with the current string + * @return Number of substrings resulting from the replace operations on this row + */ + __device__ size_type count_strings(size_type idx, + target_pair const* d_positions, + size_type const* d_targets_offsets) const + { + if (!is_valid(idx)) { return 0; } + + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr(); + auto const targets_positions = cudf::device_span( + d_positions + d_targets_offsets[idx], d_targets_offsets[idx + 1] - d_targets_offsets[idx]); + + size_type count = 1; // always at least one string + auto str_ptr = d_str.data(); + for (auto d_pair : targets_positions) { + auto const d_pos = d_pair.first; + auto const d_tgt = d_targets[d_pair.second]; + auto const tgt_ptr = base_ptr + d_pos; + if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { + auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); + if (keep_size > 0) { count++; } // don't bother counting empty strings + + auto const d_repl = get_replacement_string(d_pair.second); + if (!d_repl.empty()) { count++; } + + str_ptr += keep_size + d_tgt.size_bytes(); + } + } + + return count; + } + + /** + * @brief Retrieve the strings for each row + * + * This will return string segments as string_index_pair objects for + * parts of the string that are not replaced interlaced with the + * appropriate replacement string where replacement targets are found. + * + * This function is called only once to produce both the string_index_pair objects + * and the output row size in bytes. 
+ * + * @param idx Index of the row in d_strings + * @param d_offsets Offsets to identify where to store the results of the replace for this string + * @param d_positions The target positions found in the chars column + * @param d_targets_offsets The offsets to identify which target positions go with this string + * @param d_all_strings The output of all the produced string segments + * @return The size in bytes of the output string for this row + */ + __device__ size_type get_strings(size_type idx, + size_type const* d_offsets, + target_pair const* d_positions, + size_type const* d_targets_offsets, + string_index_pair* d_all_strings) const + { + if (!is_valid(idx)) { return 0; } + + auto const d_output = d_all_strings + d_offsets[idx]; + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr(); + + auto const targets_positions = cudf::device_span( + d_positions + d_targets_offsets[idx], d_targets_offsets[idx + 1] - d_targets_offsets[idx]); + + size_type output_idx = 0; + size_type output_size = 0; + auto str_ptr = d_str.data(); + for (auto d_pair : targets_positions) { + auto const d_pos = d_pair.first; + auto const d_tgt = d_targets[d_pair.second]; + auto const tgt_ptr = base_ptr + d_pos; + if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { + auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); + if (keep_size > 0) { d_output[output_idx++] = string_index_pair{str_ptr, keep_size}; } + output_size += keep_size; + + auto const d_repl = get_replacement_string(d_pair.second); + if (!d_repl.empty()) { + d_output[output_idx++] = string_index_pair{d_repl.data(), d_repl.size_bytes()}; + } + output_size += d_repl.size_bytes(); + + str_ptr += keep_size + d_tgt.size_bytes(); + } + } + // include any leftover parts of the string + if (str_ptr <= d_str_end) { + auto const left_size = static_cast(thrust::distance(str_ptr, d_str_end)); + d_output[output_idx] = 
string_index_pair{str_ptr, left_size}; + output_size += left_size; + } + return output_size; + } + + replace_multi_parallel_fn(column_device_view const& d_strings, + device_span d_targets, + device_span d_replacements) + : d_strings(d_strings), d_targets{d_targets}, d_replacements{d_replacements} + { + } + + protected: + column_device_view d_strings; + device_span d_targets; + device_span d_replacements; +}; + +/** + * @brief Used by the copy-if function to produce target_pair objects + * + * Using an inplace lambda caused a runtime crash in thrust::copy_if + * (this happens sometimes when passing device lambdas to thrust algorithms) + */ +struct pair_generator { + __device__ target_pair operator()(int idx) const + { + auto pos = fn.has_target(idx, chars_bytes); + return target_pair{idx, pos.value_or(-1)}; + } + replace_multi_parallel_fn fn; + size_type chars_bytes; +}; + +struct copy_if_fn { + __device__ bool operator()(target_pair pos) { return pos.second >= 0; } +}; + +std::unique_ptr replace_character_parallel(strings_column_view const& input, + strings_column_view const& targets, + strings_column_view const& repls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto d_strings = column_device_view::create(input.parent(), stream); + + auto const strings_count = input.size(); + auto const chars_bytes = + cudf::detail::get_value(input.offsets(), input.offset() + strings_count, stream) - + cudf::detail::get_value(input.offsets(), input.offset(), stream); + + auto d_targets = + create_string_vector_from_column(targets, stream, rmm::mr::get_current_device_resource()); + auto d_replacements = + create_string_vector_from_column(repls, stream, rmm::mr::get_current_device_resource()); + + replace_multi_parallel_fn fn{*d_strings, d_targets, d_replacements}; + + // count the number of targets in the entire column + auto const target_count = thrust::count_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + 
thrust::make_counting_iterator(chars_bytes), + [fn, chars_bytes] __device__(size_type idx) { + return fn.has_target(idx, chars_bytes).has_value(); + }); + // Create a vector of every target position in the chars column. + // These may include overlapping targets which will be resolved later. + auto targets_positions = rmm::device_uvector(target_count, stream); + auto d_positions = targets_positions.data(); + + auto const copy_itr = + cudf::detail::make_counting_transform_iterator(0, pair_generator{fn, chars_bytes}); + auto const copy_end = thrust::copy_if( + rmm::exec_policy(stream), copy_itr, copy_itr + chars_bytes, d_positions, copy_if_fn{}); + + // create a vector of offsets to each string's set of target positions + auto const targets_offsets = [&] { + auto string_indices = rmm::device_uvector(target_count, stream); + + auto const pos_itr = cudf::detail::make_counting_transform_iterator( + 0, [d_positions] __device__(auto idx) -> size_type { return d_positions[idx].first; }); + auto pos_count = std::distance(d_positions, copy_end); + + thrust::upper_bound(rmm::exec_policy(stream), + input.offsets_begin(), + input.offsets_end(), + pos_itr, + pos_itr + pos_count, + string_indices.begin()); + + // compute offsets per string + auto targets_offsets = rmm::device_uvector(strings_count + 1, stream); + auto d_targets_offsets = targets_offsets.data(); + + // memset to zero-out the target counts for any null-entries or strings with no targets + thrust::uninitialized_fill( + rmm::exec_policy(stream), targets_offsets.begin(), targets_offsets.end(), 0); + + // next, count the number of targets per string + auto d_string_indices = string_indices.data(); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + target_count, + [d_string_indices, d_targets_offsets] __device__(size_type idx) { + auto const str_idx = d_string_indices[idx] - 1; + atomicAdd(d_targets_offsets + str_idx, 1); + }); + // finally, convert the counts into offsets + 
thrust::exclusive_scan(rmm::exec_policy(stream), + targets_offsets.begin(), + targets_offsets.end(), + targets_offsets.begin()); + return targets_offsets; + }(); + auto const d_targets_offsets = targets_offsets.data(); + + // compute the number of string segments produced by replace in each string + auto counts = rmm::device_uvector(strings_count, stream); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + counts.begin(), + [fn, d_positions, d_targets_offsets] __device__(size_type idx) -> size_type { + return fn.count_strings(idx, d_positions, d_targets_offsets); + }); + + // create offsets from the counts + auto offsets = + std::get<0>(cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); + auto const total_strings = + cudf::detail::get_value(offsets->view(), strings_count, stream); + auto const d_strings_offsets = offsets->view().data(); + + // build a vector of all the positions for all the strings + auto indices = rmm::device_uvector(total_strings, stream); + auto d_indices = indices.data(); + auto d_sizes = counts.data(); // reusing this vector to hold output sizes now + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + [fn, d_strings_offsets, d_positions, d_targets_offsets, d_indices, d_sizes] __device__( + size_type idx) { + d_sizes[idx] = + fn.get_strings(idx, d_strings_offsets, d_positions, d_targets_offsets, d_indices); + }); + + // use this utility to gather the string parts into a contiguous chars column + auto chars = make_strings_column(indices.begin(), indices.end(), stream, mr); + + // create offsets from the sizes + offsets = + std::get<0>(cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); + + // build the strings columns from the chars and offsets + return make_strings_column(strings_count, + std::move(offsets), + 
std::move(chars->release().children.back()), + input.null_count(), + copy_bitmask(input.parent(), stream, mr)); +} + +/** + * @brief Function logic for the replace_string_parallel + * + * Performs the multi-replace operation with a thread per string. + * This performs best on smaller strings. @see AVG_CHAR_BYTES_THRESHOLD + */ +struct replace_multi_fn { + column_device_view const d_strings; + column_device_view const d_targets; + column_device_view const d_repls; + int32_t* d_offsets{}; + char* d_chars{}; + + __device__ void operator()(size_type idx) + { + if (d_strings.is_null(idx)) { + if (!d_chars) { d_offsets[idx] = 0; } + return; + } + auto const d_str = d_strings.element(idx); + char const* in_ptr = d_str.data(); + + size_type bytes = d_str.size_bytes(); + size_type spos = 0; + size_type lpos = 0; + char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + + // check each character against each target + while (spos < d_str.size_bytes()) { + for (int tgt_idx = 0; tgt_idx < d_targets.size(); ++tgt_idx) { + auto const d_tgt = d_targets.element(tgt_idx); + if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit + (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match + { + auto const d_repl = (d_repls.size() == 1) ? 
d_repls.element(0) + : d_repls.element(tgt_idx); + bytes += d_repl.size_bytes() - d_tgt.size_bytes(); + if (out_ptr) { + out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos); + out_ptr = copy_string(out_ptr, d_repl); + lpos = spos + d_tgt.size_bytes(); + } + spos += d_tgt.size_bytes() - 1; + break; + } + } + ++spos; + } + if (out_ptr) // copy remainder + memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); + else + d_offsets[idx] = bytes; + } +}; + +std::unique_ptr replace_string_parallel(strings_column_view const& input, + strings_column_view const& targets, + strings_column_view const& repls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto d_strings = column_device_view::create(input.parent(), stream); + auto d_targets = column_device_view::create(targets.parent(), stream); + auto d_replacements = column_device_view::create(repls.parent(), stream); + + auto children = cudf::strings::detail::make_strings_children( + replace_multi_fn{*d_strings, *d_targets, *d_replacements}, input.size(), stream, mr); + + return make_strings_column(input.size(), + std::move(children.first), + std::move(children.second), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); +} + +} // namespace + +std::unique_ptr replace(strings_column_view const& input, + strings_column_view const& targets, + strings_column_view const& repls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) { return make_empty_column(type_id::STRING); } + CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)), + "Parameters targets must not be empty and must not have nulls"); + CUDF_EXPECTS(((repls.size() > 0) && (repls.null_count() == 0)), + "Parameters repls must not be empty and must not have nulls"); + if (repls.size() > 1) + CUDF_EXPECTS(repls.size() == targets.size(), "Sizes for targets and repls must match"); + + return (input.size() == input.null_count() || + 
((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD)) + ? replace_string_parallel(input, targets, repls, stream, mr) + : replace_character_parallel(input, targets, repls, stream, mr); +} + +} // namespace detail + +// external API + +std::unique_ptr replace(strings_column_view const& strings, + strings_column_view const& targets, + strings_column_view const& repls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::replace(strings, targets, repls, cudf::get_default_stream(), mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index d1a377a4bda..3fc969a4c1f 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -704,92 +704,6 @@ std::unique_ptr replace_slice(strings_column_view const& strings, cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } -namespace { -/** - * @brief Function logic for the replace_multi API. - * - * This will perform the multi-replace operation on each string. - */ -struct replace_multi_fn { - column_device_view const d_strings; - column_device_view const d_targets; - column_device_view const d_repls; - int32_t* d_offsets{}; - char* d_chars{}; - - __device__ void operator()(size_type idx) - { - if (d_strings.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; - return; - } - auto const d_str = d_strings.element(idx); - char const* in_ptr = d_str.data(); - - size_type bytes = d_str.size_bytes(); - size_type spos = 0; - size_type lpos = 0; - char* out_ptr = d_chars ? 
d_chars + d_offsets[idx] : nullptr; - - // check each character against each target - while (spos < d_str.size_bytes()) { - for (int tgt_idx = 0; tgt_idx < d_targets.size(); ++tgt_idx) { - auto const d_tgt = d_targets.element(tgt_idx); - if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit - (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match - { - auto const d_repl = (d_repls.size() == 1) ? d_repls.element(0) - : d_repls.element(tgt_idx); - bytes += d_repl.size_bytes() - d_tgt.size_bytes(); - if (out_ptr) { - out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos); - out_ptr = copy_string(out_ptr, d_repl); - lpos = spos + d_tgt.size_bytes(); - } - spos += d_tgt.size_bytes() - 1; - break; - } - } - ++spos; - } - if (out_ptr) // copy remainder - memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); - else - d_offsets[idx] = bytes; - } -}; - -} // namespace - -std::unique_ptr replace(strings_column_view const& strings, - strings_column_view const& targets, - strings_column_view const& repls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (strings.is_empty()) return make_empty_column(type_id::STRING); - CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)), - "Parameters targets must not be empty and must not have nulls"); - CUDF_EXPECTS(((repls.size() > 0) && (repls.null_count() == 0)), - "Parameters repls must not be empty and must not have nulls"); - if (repls.size() > 1) - CUDF_EXPECTS(repls.size() == targets.size(), "Sizes for targets and repls must match"); - - auto d_strings = column_device_view::create(strings.parent(), stream); - auto d_targets = column_device_view::create(targets.parent(), stream); - auto d_repls = column_device_view::create(repls.parent(), stream); - - // this utility calls the given functor to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children( - replace_multi_fn{*d_strings, *d_targets, *d_repls}, 
strings.size(), stream, mr); - - return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); -} - std::unique_ptr replace_nulls(strings_column_view const& strings, string_scalar const& repl, rmm::cuda_stream_view stream, @@ -854,14 +768,5 @@ std::unique_ptr replace_slice(strings_column_view const& strings, return detail::replace_slice(strings, repl, start, stop, cudf::get_default_stream(), mr); } -std::unique_ptr replace(strings_column_view const& strings, - strings_column_view const& targets, - strings_column_view const& repls, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::replace(strings, targets, repls, cudf::get_default_stream(), mr); -} - } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index 32e097838c0..85185b2deab 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -290,28 +290,22 @@ TEST_F(StringsReplaceTest, ReplaceSlice) TEST_F(StringsReplaceTest, ReplaceSliceError) { - std::vector h_strings{"Héllo", "thesé", nullptr, "are not", "important", ""}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - auto strings_view = cudf::strings_column_view(strings); - EXPECT_THROW(cudf::strings::replace_slice(strings_view, cudf::string_scalar(""), 4, 1), - cudf::logic_error); + cudf::test::strings_column_wrapper input({"Héllo", "thesé", "are not", "important", ""}); + EXPECT_THROW( + cudf::strings::replace_slice(cudf::strings_column_view(input), cudf::string_scalar(""), 4, 1), + cudf::logic_error); } TEST_F(StringsReplaceTest, ReplaceMulti) { - auto strings = build_corpus(); - auto strings_view = cudf::strings_column_view(strings); + auto input = build_corpus(); + auto strings_view = cudf::strings_column_view(input); - std::vector h_targets{"the ", "a ", "to "}; - cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end()); + cudf::test::strings_column_wrapper targets({"the ", "a ", "to "}); auto targets_view = cudf::strings_column_view(targets); { - std::vector h_repls{"_ ", "A ", "2 "}; - cudf::test::strings_column_wrapper repls(h_repls.begin(), h_repls.end()); + cudf::test::strings_column_wrapper repls({"_ ", "A ", "2 "}); auto repls_view = cudf::strings_column_view(repls); auto results = cudf::strings::replace(strings_view, targets_view, repls_view); @@ -331,8 +325,7 @@ TEST_F(StringsReplaceTest, ReplaceMulti) } { - std::vector h_repls{"* "}; - cudf::test::strings_column_wrapper repls(h_repls.begin(), h_repls.end()); + cudf::test::strings_column_wrapper repls({"* "}); auto repls_view = cudf::strings_column_view(repls); auto results = cudf::strings::replace(strings_view, targets_view, repls_view); @@ -352,6 +345,129 @@ TEST_F(StringsReplaceTest, ReplaceMulti) } } 
+TEST_F(StringsReplaceTest, ReplaceMultiLong) +{ + // The length of the strings are to trigger the code path governed by the AVG_CHAR_BYTES_THRESHOLD + // setting in the multi.cu. + auto input = cudf::test::strings_column_wrapper( + {"This string needs to be very long to trigger the long-replace internal functions. " + "This string needs to be very long to trigger the long-replace internal functions. " + "This string needs to be very long to trigger the long-replace internal functions. " + "This string needs to be very long to trigger the long-replace internal functions.", + "012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012" + "345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345" + "678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678" + "901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901" + "2345678901234567890123456789", + "012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012" + "345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345" + "678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678" + "901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901" + "2345678901234567890123456789", + "Test string for overlap check: bananaápple bananá ápplebananá banápple ápple bananá " + "Test string for overlap check: bananaápple bananá ápplebananá banápple ápple bananá " + "Test string for overlap check: bananaápple bananá ápplebananá banápple ápple bananá " + "Test string for overlap check: bananaápple bananá ápplebananá banápple ápple bananá " + "Test string for overlap check: bananaápple bananá ápplebananá banápple ápple bananá", + "", + ""}, + {1, 1, 1, 1, 0, 1}); + auto strings_view = cudf::strings_column_view(input); + + auto targets = 
cudf::test::strings_column_wrapper({"78901", "bananá", "ápple", "78"}); + auto targets_view = cudf::strings_column_view(targets); + + { + cudf::test::strings_column_wrapper repls({"x", "PEAR", "avocado", "$$"}); + auto repls_view = cudf::strings_column_view(repls); + + auto results = cudf::strings::replace(strings_view, targets_view, repls_view); + + cudf::test::strings_column_wrapper expected( + {"This string needs to be very long to trigger the long-replace internal functions. " + "This string needs to be very long to trigger the long-replace internal functions. " + "This string needs to be very long to trigger the long-replace internal functions. " + "This string needs to be very long to trigger the long-replace internal functions.", + "0123456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456" + "x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x" + "23456x23456x23456x23456x23456x23456x23456x23456x23456x23456$$9", + "0123456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456" + "x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x23456x" + "23456x23456x23456x23456x23456x23456x23456x23456x23456x23456$$9", + "Test string for overlap check: bananaavocado PEAR avocadoPEAR banavocado avocado PEAR " + "Test string for overlap check: bananaavocado PEAR avocadoPEAR banavocado avocado PEAR " + "Test string for overlap check: bananaavocado PEAR avocadoPEAR banavocado avocado PEAR " + "Test string for overlap check: bananaavocado PEAR avocadoPEAR banavocado avocado PEAR " + "Test string for overlap check: bananaavocado PEAR avocadoPEAR banavocado avocado PEAR", + "", + ""}, + {1, 1, 1, 1, 0, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + + { + cudf::test::strings_column_wrapper repls({"*"}); + auto repls_view = cudf::strings_column_view(repls); + + auto results = cudf::strings::replace(strings_view, targets_view, 
repls_view); + + cudf::test::strings_column_wrapper expected( + {"This string needs to be very long to trigger the long-replace internal functions. " + "This string needs to be very long to trigger the long-replace internal functions. " + "This string needs to be very long to trigger the long-replace internal functions. " + "This string needs to be very long to trigger the long-replace internal functions.", + "0123456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*" + "23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*" + "23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*9", + "0123456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*" + "23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*" + "23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*23456*9", + "Test string for overlap check: banana* * ** ban* * * Test string for overlap check: " + "banana* * ** ban* * * Test string for overlap check: banana* * ** ban* * * Test string for " + "overlap check: banana* * ** ban* * * Test string for overlap check: banana* * ** ban* * *", + "", + ""}, + {1, 1, 1, 1, 0, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + + { + targets = + cudf::test::strings_column_wrapper({"01234567890123456789012345678901234567890123456789012345" + "6789012345678901234567890123456789012" + "34567890123456789012345678901234567890123456789012345678" + "9012345678901234567890123456789012345" + "67890123456789012345678901234567890123456789012345678901" + "2345678901234567890123456789012345678" + "90123456789012345678901234567890123456789012345678901234" + "5678901234567890123456789012345678901" + "2345678901234567890123456789", + "78"}); + targets_view = cudf::strings_column_view(targets); + auto repls = cudf::test::strings_column_wrapper({""}); + auto repls_view = cudf::strings_column_view(repls); + + auto results = 
cudf::strings::replace(strings_view, targets_view, repls_view); + + cudf::test::strings_column_wrapper expected( + {"This string needs to be very long to trigger the long-replace internal functions. " + "This string needs to be very long to trigger the long-replace internal functions. " + "This string needs to be very long to trigger the long-replace internal functions. " + "This string needs to be very long to trigger the long-replace internal functions.", + "", + "", + "Test string for overlap check: bananaápple bananá ápplebananá banápple ápple bananá " + "Test string for overlap check: bananaápple bananá ápplebananá banápple ápple bananá " + "Test string for overlap check: bananaápple bananá ápplebananá banápple ápple bananá " + "Test string for overlap check: bananaápple bananá ápplebananá banápple ápple bananá " + "Test string for overlap check: bananaápple bananá ápplebananá banápple ápple bananá", + "", + ""}, + {1, 1, 1, 1, 0, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + TEST_F(StringsReplaceTest, EmptyStringsColumn) { cudf::column_view zero_size_strings_column( From d9e1b90dc135f5dfb7dbf91f24de91c3c046843a Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 17 Mar 2023 09:38:03 -0400 Subject: [PATCH 19/63] cudftestutil supports static gtest dependencies (#12957) cudftestutil needs to be statically so that it can support gtest. So to allow this and the preload hooks we have to roll `tests/utilities/default_stream.cpp` into a separate lib. 
Will correct downstream issues including: https://github.com/NVIDIA/spark-rapids-jni/issues/1006 Authors: - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/12957 --- conda/recipes/libcudf/meta.yaml | 2 +- cpp/CMakeLists.txt | 42 ++++++++++++++++++++++++++------- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index b31ff37d23b..852f27b2c9d 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -74,7 +74,7 @@ outputs: test: commands: - test -f $PREFIX/lib/libcudf.so - - test -f $PREFIX/lib/libcudftestutil.so + - test -f $PREFIX/lib/libcudftestutil.a - test -f $PREFIX/lib/libcudf_identify_stream_usage_mode_cudf.so - test -f $PREFIX/lib/libcudf_identify_stream_usage_mode_testing.so - test -f $PREFIX/include/cudf/aggregation.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c50464762c0..0fcd1895972 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -73,7 +73,7 @@ option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compila option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL ON) -if(${CUDA_STATIC_RUNTIME}) +if(CUDA_STATIC_RUNTIME OR NOT BUILD_SHARED_LIBS) set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL OFF) endif() option( @@ -741,15 +741,39 @@ add_library(cudf::cudf ALIAS cudf) if(CUDF_BUILD_TESTUTIL) add_library( - # This library must also be compiled as a dynamic library to support - # LD_PRELOAD injection of symbols. We currently leverage this for - # stream-related library validation and may make use of it for other - # similar features in the future. 
- cudftestutil SHARED + cudftest_default_stream + # When compiled as a dynamic library allows us to use LD_PRELOAD injection of symbols. We + # currently leverage this for stream-related library validation and may make use of it for + # other similar features in the future. + tests/utilities/default_stream.cpp + ) + set_target_properties( + cudftest_default_stream + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) + target_link_libraries( + cudftest_default_stream + PUBLIC cudf + PRIVATE $ + ) + + add_library(cudf::cudftest_default_stream ALIAS cudftest_default_stream) + + # Needs to be static so that we support usage of static builds of gtest which doesn't compile with + # fPIC enabled and therefore can't be embedded into shared libraries. + add_library( + cudftestutil STATIC tests/io/metadata_utilities.cpp tests/utilities/base_fixture.cpp tests/utilities/column_utilities.cu - tests/utilities/default_stream.cpp tests/utilities/table_utilities.cu tests/utilities/tdigest_utilities.cu ) @@ -774,7 +798,7 @@ if(CUDF_BUILD_TESTUTIL) target_link_libraries( cudftestutil - PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf + PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf cudftest_default_stream PRIVATE $ ) @@ -872,7 +896,7 @@ install(DIRECTORY ${CUDF_SOURCE_DIR}/include/cudf ${CUDF_SOURCE_DIR}/include/cud if(CUDF_BUILD_TESTUTIL) install( - TARGETS cudftestutil + TARGETS cudftest_default_stream cudftestutil DESTINATION ${lib_dir} EXPORT cudf-testing-exports ) From 49e87b8bc10643821a3f49ebc0d1045a38c48376 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 17 Mar 2023 14:53:31 -0500 Subject: [PATCH 20/63] Declare a different name for nan_equality.UNEQUAL to prevent Cython warnings. (#12947) Closes #9462. 
``` warning: cudf/_lib/cpp/types.pxd:51:8: 'UNEQUAL' redeclared ``` We've been dealing with warnings like this for as long as I've worked on cudf. I think it'd be good to fix this by renaming the Cython enum. We don't rely on this name very much at the Cython/Python level, so it's a small change. (This has been irking me for a long time and I want to see this warning go away in the build logs.) Authors: - Bradley Dice (https://github.com/bdice) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/12947 --- python/cudf/cudf/_lib/cpp/types.pxd | 6 ++++-- python/cudf/cudf/_lib/lists.pyx | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index b1a257feedf..e4106ffb99d 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libc.stdint cimport int32_t, uint32_t @@ -47,8 +47,10 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: UNEQUAL "cudf::null_equality::UNEQUAL" ctypedef enum nan_equality "cudf::nan_equality": + # These names differ from the C++ names due to Cython warnings if + # "UNEQUAL" is declared by both null_equality and nan_equality. ALL_EQUAL "cudf::nan_equality::ALL_EQUAL" - UNEQUAL "cudf::nan_equality::UNEQUAL" + NANS_UNEQUAL "cudf::nan_equality::UNEQUAL" ctypedef enum type_id "cudf::type_id": EMPTY "cudf::type_id::EMPTY" diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 47e9dccc8e6..199641fd2ce 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
from cudf.core.buffer import acquire_spill_lock @@ -84,7 +84,7 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL ) cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL + nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.NANS_UNEQUAL ) cdef unique_ptr[column] c_result From caef9a6541fda63b32e9b90a0fe67b8620ba16bb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 17 Mar 2023 16:39:10 -0400 Subject: [PATCH 21/63] Include gtest in build environment. (#12956) Google test is currently being pulled via CPM when our packages are being built instead of using the conda package. This results in gtest (and all its CMake files) being bundled into the cudf package, which interferes with package discovery. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Mark Sadang (https://github.com/msadang) URL: https://github.com/rapidsai/cudf/pull/12956 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 ++ conda/recipes/libcudf/meta.yaml | 6 ++++-- dependencies.yaml | 11 +++++++++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index ef92a9ad80f..b71101e7e3b 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -30,6 +30,8 @@ dependencies: - fmt>=9.1.0,<10 - fsspec>=0.6.0 - gcc_linux-64=11.* +- gmock==1.10.0.* +- gtest==1.10.0.* - hypothesis - ipython - libarrow==10.0.1.* diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 852f27b2c9d..caa807bd7ec 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -51,6 +51,8 @@ requirements: - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} - spdlog {{ spdlog_version }} + - gtest {{ gtest_version }} 
+ - gmock {{ gtest_version }} outputs: - name: libcudf @@ -71,6 +73,8 @@ outputs: - librmm ={{ minor_version }} - libarrow {{ libarrow_version }} - dlpack {{ dlpack_version }} + - gtest {{ gtest_version }} + - gmock {{ gtest_version }} test: commands: - test -f $PREFIX/lib/libcudf.so @@ -379,8 +383,6 @@ outputs: - {{ pin_subpackage('libcudf', exact=True) }} - {{ pin_subpackage('libcudf_kafka', exact=True) }} - cudatoolkit {{ cuda_spec }} - - gtest {{ gtest_version }} - - gmock {{ gtest_version }} about: home: https://rapids.ai/ license: Apache-2.0 diff --git a/dependencies.yaml b/dependencies.yaml index e3fe3bbda0d..12a6d1b370e 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -212,11 +212,13 @@ dependencies: - output_types: conda packages: - fmt>=9.1.0,<10 - - librdkafka=1.7.0 - - spdlog>=1.11.0,<1.12 + - &gtest gtest==1.10.0.* + - &gmock gmock==1.10.0.* # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - libarrow==10.0.1.* + - librdkafka=1.7.0 + - spdlog>=1.11.0,<1.12 build_wheels: common: - output_types: pyproject @@ -405,6 +407,11 @@ dependencies: - *cudf - cudf_kafka==23.4.* test_cpp: + common: + - output_types: conda + packages: + - *gtest + - *gmock specific: - output_types: conda matrices: From 8fbfb4ae5f083ac8473023f9e4982e8021b83c6d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 17 Mar 2023 17:13:40 -0400 Subject: [PATCH 22/63] Remove MANIFEST.in use auto-generated one for sdists and package_data for wheels (#12960) Using MANIFEST.in currently runs into a pretty nasty scikit-build bug (https://github.com/scikit-build/scikit-build/issues/886) that results in any file included by the manifest being copied from the install tree back into the source tree whenever an in place build occurs after an install, overwriting any local changes. We need an alternative approach to ensure that all necessary files are included in built packages.
There are two types: - sdists: scikit-build automatically generates a manifest during sdist generation if we don't provide one, and that manifest is reliably complete. It contains all files needed for a source build up to the cudf C++ code (which has always been true and is something we can come back to improving later if desired). - wheels: The autogenerated manifest is not used during wheel generation because the manifest generation hook is not invoked during wheel builds, so to include data in the wheels we must provide the `package_data` argument to `setup`. In this case we do not need to include CMake or pyx files because the result does not need to be possible to build from, it just needs pxd files for other packages to cimport if desired. I also reverted #12945, which was a stopgap solution to avoid this underlying problem. That change would have caused import issues inside the python/cudf directory when installing (the lack of an inplace build would have made the source tree unimportable) so this fix removes that minor limitation introduced in that PR. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12960 --- build.sh | 5 ++--- python/cudf/MANIFEST.in | 16 ---------------- python/cudf/setup.py | 5 +++-- python/cudf_kafka/MANIFEST.in | 3 --- python/cudf_kafka/setup.py | 8 +++----- 5 files changed, 8 insertions(+), 29 deletions(-) delete mode 100644 python/cudf/MANIFEST.in delete mode 100644 python/cudf_kafka/MANIFEST.in diff --git a/build.sh b/build.sh index 0de87826ba7..bee66d819b4 100755 --- a/build.sh +++ b/build.sh @@ -331,10 +331,9 @@ fi if buildAll || hasArg cudf; then cd ${REPODIR}/python/cudf + python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} -DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} if [[ ${INSTALL_TARGET} != "" ]]; then - python setup.py install --single-version-externally-managed --record=record.txt -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} - else - python setup.py build_ext --inplace -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} + python setup.py install --single-version-externally-managed --record=record.txt -- -DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS} -- -j${PARALLEL_LEVEL:-1} fi fi diff --git a/python/cudf/MANIFEST.in b/python/cudf/MANIFEST.in deleted file mode 100644 index 4d3155158f8..00000000000 --- a/python/cudf/MANIFEST.in +++ /dev/null @@ -1,16 +0,0 @@ -# Cython files -recursive-include cudf *.pxd -recursive-include cudf *.pyx - -# Typing files -recursive-include cudf *.pyi - -# C++ files 
-recursive-include cudf *.hpp -recursive-include udf_cpp *.hpp -recursive-include udf_cpp *.cuh - -# Build files. Don't use a recursive include on '.' in case the repo is dirty -include . CMakeLists.txt -recursive-include cudf CMakeLists.txt -recursive-include cmake * diff --git a/python/cudf/setup.py b/python/cudf/setup.py index 8a7ebf574fe..96b91b4ccc0 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -3,8 +3,9 @@ from setuptools import find_packages from skbuild import setup +packages = find_packages(include=["cudf*", "udf_cpp*"]) setup( - include_package_data=True, - packages=find_packages(include=["cudf", "cudf.*"]), + packages=packages, + package_data={key: ["*.pxd", "*.hpp", "*.cuh"] for key in packages}, zip_safe=False, ) diff --git a/python/cudf_kafka/MANIFEST.in b/python/cudf_kafka/MANIFEST.in deleted file mode 100644 index 249a9238816..00000000000 --- a/python/cudf_kafka/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -# Cython files -recursive-include cudf_kafka *.pxd -recursive-include cudf_kafka *.pyx diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py index c915b7e80ab..d955d95858a 100644 --- a/python/cudf_kafka/setup.py +++ b/python/cudf_kafka/setup.py @@ -80,6 +80,7 @@ ) ] +packages = find_packages(include=["cudf_kafka*"]) setup( # Include the separately-compiled shared library ext_modules=cythonize( @@ -89,10 +90,7 @@ profile=False, language_level=3, embedsignature=True ), ), - packages=find_packages(include=["cudf_kafka", "cudf_kafka.*"]), - package_data=dict.fromkeys( - find_packages(include=["cudf_kafka._lib*"]), - ["*.pxd"], - ), + packages=packages, + package_data={key: ["*.pxd"] for key in packages}, zip_safe=False, ) From 1a98adacba5ff37d942968f2906fbdd1dc363662 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 20 Mar 2023 10:02:24 -0400 Subject: [PATCH 23/63] Remove default detail mrs: part1 (#12964) This is the first PR in a sequence removing default mr parameters in detail APIs. Contributes to #12944. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/12964 --- cpp/include/cudf/detail/binaryop.hpp | 54 +++--- .../detail/calendrical_month_sequence.cuh | 4 +- cpp/include/cudf/detail/datetime.hpp | 118 ++++++------- cpp/include/cudf/detail/fill.hpp | 15 +- .../detail/groupby/group_replace_nulls.hpp | 13 +- .../cudf/detail/groupby/sort_helper.hpp | 24 ++- cpp/include/cudf/detail/interop.hpp | 21 +-- cpp/include/cudf/detail/label_bins.hpp | 17 +- .../cudf/detail/reduction_functions.hpp | 157 ++++++++---------- cpp/src/groupby/groupby.cu | 4 +- cpp/src/groupby/sort/functors.hpp | 6 +- cpp/src/groupby/sort/scan.cpp | 4 +- cpp/src/text/generate_ngrams.cu | 3 +- 13 files changed, 199 insertions(+), 241 deletions(-) diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp index ffd8be971ab..e5609568d10 100644 --- a/cpp/include/cudf/detail/binaryop.hpp +++ b/cpp/include/cudf/detail/binaryop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,13 +30,12 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr binary_operation( - column_view const& lhs, - column_view const& rhs, - std::string const& ptx, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr binary_operation(column_view const& lhs, + column_view const& rhs, + std::string const& ptx, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::binary_operation(scalar const&, column_view const&, binary_operator, @@ -44,13 +43,12 @@ std::unique_ptr binary_operation( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr binary_operation( - scalar const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr binary_operation(scalar const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::binary_operation(column_view const&, scalar const&, binary_operator, @@ -58,13 +56,12 @@ std::unique_ptr binary_operation( * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr binary_operation( - column_view const& lhs, - scalar const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr binary_operation(column_view const& lhs, + scalar const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::binary_operation(column_view const&, column_view const&, @@ -72,12 +69,11 @@ std::unique_ptr binary_operation( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr binary_operation( - column_view const& lhs, - column_view const& rhs, - binary_operator op, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr binary_operation(column_view const& lhs, + column_view const& rhs, + binary_operator op, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/calendrical_month_sequence.cuh b/cpp/include/cudf/detail/calendrical_month_sequence.cuh index 9dba0ba8961..59fb6758973 100644 --- a/cpp/include/cudf/detail/calendrical_month_sequence.cuh +++ b/cpp/include/cudf/detail/calendrical_month_sequence.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,7 +38,7 @@ struct calendrical_month_sequence_functor { scalar const& input, size_type months, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr) { // Return empty column if n = 0 if (n == 0) return cudf::make_empty_column(input.type()); diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index c2e3c32b65f..c5160958165 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,70 +29,63 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_year( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_year(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::extract_month(cudf::column_view const&, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_month( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_month(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::extract_day(cudf::column_view const&, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr extract_day( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_day(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_weekday( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_weekday(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_hour( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_hour(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr extract_minute( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_minute(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::extract_second(cudf::column_view const&, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_second( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_second(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, @@ -100,10 +93,9 @@ std::unique_ptr extract_second( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_millisecond_fraction( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_millisecond_fraction(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, @@ -111,10 +103,9 @@ std::unique_ptr extract_millisecond_fraction( * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr extract_microsecond_fraction( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_microsecond_fraction(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, @@ -122,30 +113,27 @@ std::unique_ptr extract_microsecond_fraction( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_nanosecond_fraction( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_nanosecond_fraction(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr last_day_of_month( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr last_day_of_month(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr day_of_year( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr day_of_year(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::column_view const&, @@ -153,11 +141,10 @@ std::unique_ptr day_of_year( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr add_calendrical_months( - cudf::column_view const& timestamps, - cudf::column_view const& months, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, + cudf::column_view const& months, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::scalar const&, @@ -165,26 +152,23 @@ std::unique_ptr add_calendrical_months( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr add_calendrical_months( - cudf::column_view const& timestamps, - cudf::scalar const& months, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, + cudf::scalar const& months, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::mr::device_memory_resource *) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr is_leap_year( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr is_leap_year(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); -std::unique_ptr extract_quarter( - cudf::column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_quarter(cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace datetime diff --git a/cpp/include/cudf/detail/fill.hpp b/cpp/include/cudf/detail/fill.hpp index e34acfff6b9..caaccfb4851 100644 --- a/cpp/include/cudf/detail/fill.hpp +++ b/cpp/include/cudf/detail/fill.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,13 +43,12 @@ void fill_in_place(mutable_column_view& destination, * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr fill( - column_view const& input, - size_type begin, - size_type end, - scalar const& value, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr fill(column_view const& input, + size_type begin, + size_type end, + scalar const& value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp index 9e64048b7b4..e081a626c75 100644 --- a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp +++ b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,12 +36,11 @@ namespace detail { * @param stream CUDA stream used for device memory operations and kernel launches. * @param[in] mr Device memory resource used to allocate device memory of the returned column. 
*/ -std::unique_ptr group_replace_nulls( - cudf::column_view const& grouped_value, - device_span group_labels, - cudf::replace_policy replace_policy, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr group_replace_nulls(cudf::column_view const& grouped_value, + device_span group_labels, + cudf::replace_policy replace_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace groupby diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index e2510d75a83..663ff44ca56 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -85,10 +85,9 @@ struct sort_groupby_helper { * @param values The value column to group and sort * @return the sorted and grouped column */ - std::unique_ptr sorted_values( - column_view const& values, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + std::unique_ptr sorted_values(column_view const& values, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Groups a column of values according to `keys` @@ -100,28 +99,25 @@ struct sort_groupby_helper { * @param values The value column to group * @return the grouped column */ - std::unique_ptr grouped_values( - column_view const& values, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + std::unique_ptr grouped_values(column_view const& values, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Get a table of sorted unique keys * * @return a new table in which each row is a unique row in the sorted key table. */ - std::unique_ptr
unique_keys( - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + std::unique_ptr
unique_keys(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Get a table of sorted keys * * @return a new table containing the sorted keys. */ - std::unique_ptr
sorted_keys( - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + std::unique_ptr
sorted_keys(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Get the number of groups in `keys` diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 452144da167..7117517487c 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -40,20 +40,18 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
from_dlpack( - DLManagedTensor const* managed_tensor, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::to_dlpack * * @param stream CUDA stream used for device memory operations and kernel launches. */ -DLManagedTensor* to_dlpack( - table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +DLManagedTensor* to_dlpack(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); // Creating arrow as per given type_id and buffer arguments template @@ -120,10 +118,9 @@ std::shared_ptr to_arrow(table_view input, * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
from_arrow( - arrow::Table const& input_table, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
from_arrow(arrow::Table const& input_table, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/label_bins.hpp b/cpp/include/cudf/detail/label_bins.hpp index f556c81c371..7f3cf033e66 100644 --- a/cpp/include/cudf/detail/label_bins.hpp +++ b/cpp/include/cudf/detail/label_bins.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,14 +45,13 @@ namespace detail { * * @param stream Stream view on which to allocate resources and queue execution. */ -std::unique_ptr label_bins( - column_view const& input, - column_view const& left_edges, - inclusive left_inclusive, - column_view const& right_edges, - inclusive right_inclusive, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr label_bins(column_view const& input, + column_view const& left_edges, + inclusive left_inclusive, + column_view const& right_edges, + inclusive right_inclusive, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** @} */ // end of group } // namespace detail diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/detail/reduction_functions.hpp index 1f892bb90c5..c554ea6a83e 100644 --- a/cpp/include/cudf/detail/reduction_functions.hpp +++ b/cpp/include/cudf/detail/reduction_functions.hpp @@ -42,12 +42,11 @@ namespace reduction { * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Sum as scalar of type `output_dtype` */ -std::unique_ptr sum( - column_view const& col, - data_type const output_dtype, - std::optional> init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); 
+std::unique_ptr sum(column_view const& col, + data_type const output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes minimum of elements in input column @@ -63,12 +62,11 @@ std::unique_ptr sum( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Minimum element as scalar of type `output_dtype` */ -std::unique_ptr min( - column_view const& col, - data_type const output_dtype, - std::optional> init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr min(column_view const& col, + data_type const output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes maximum of elements in input column @@ -84,12 +82,11 @@ std::unique_ptr min( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Maximum element as scalar of type `output_dtype` */ -std::unique_ptr max( - column_view const& col, - data_type const output_dtype, - std::optional> init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr max(column_view const& col, + data_type const output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes any of elements in input column is true when typecasted to bool @@ -106,12 +103,11 @@ std::unique_ptr max( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return bool scalar if any of elements is true when typecasted to bool */ -std::unique_ptr any( - column_view const& col, - data_type const output_dtype, - std::optional> init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr any(column_view const& col, 
+ data_type const output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes all of elements in input column is true when typecasted to bool @@ -128,12 +124,11 @@ std::unique_ptr any( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return bool scalar if all of elements is true when typecasted to bool */ -std::unique_ptr all( - column_view const& col, - data_type const output_dtype, - std::optional> init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr all(column_view const& col, + data_type const output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes product of elements in input column @@ -150,12 +145,11 @@ std::unique_ptr all( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Product as scalar of type `output_dtype` */ -std::unique_ptr product( - column_view const& col, - data_type const output_dtype, - std::optional> init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr product(column_view const& col, + data_type const output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes sum of squares of elements in input column @@ -171,11 +165,10 @@ std::unique_ptr product( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Sum of squares as scalar of type `output_dtype` */ -std::unique_ptr sum_of_squares( - column_view const& col, - data_type const output_dtype, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr sum_of_squares(column_view const& col, + data_type const output_dtype, 
+ rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes mean of elements in input column @@ -191,11 +184,10 @@ std::unique_ptr sum_of_squares( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Mean as scalar of type `output_dtype` */ -std::unique_ptr mean( - column_view const& col, - data_type const output_dtype, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr mean(column_view const& col, + data_type const output_dtype, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes variance of elements in input column @@ -213,12 +205,11 @@ std::unique_ptr mean( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Variance as scalar of type `output_dtype` */ -std::unique_ptr variance( - column_view const& col, - data_type const output_dtype, - size_type ddof, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr variance(column_view const& col, + data_type const output_dtype, + size_type ddof, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes standard deviation of elements in input column @@ -236,12 +227,11 @@ std::unique_ptr variance( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return Standard deviation as scalar of type `output_dtype` */ -std::unique_ptr standard_deviation( - column_view const& col, - data_type const output_dtype, - size_type ddof, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr standard_deviation(column_view const& col, + data_type const output_dtype, + size_type ddof, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Returns 
nth element in input column @@ -267,12 +257,11 @@ std::unique_ptr standard_deviation( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return nth element as scalar */ -std::unique_ptr nth_element( - column_view const& col, - size_type n, - null_policy null_handling, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr nth_element(column_view const& col, + size_type n, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Collect input column into a (list) scalar @@ -283,11 +272,10 @@ std::unique_ptr nth_element( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return collected list as scalar */ -std::unique_ptr collect_list( - column_view const& col, - null_policy null_handling, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr collect_list(column_view const& col, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Merge a bunch of list scalars into single list scalar @@ -297,10 +285,9 @@ std::unique_ptr collect_list( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return merged list as scalar */ -std::unique_ptr merge_lists( - lists_column_view const& col, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr merge_lists(lists_column_view const& col, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Collect input column into a (list) scalar without duplicated elements @@ -313,13 +300,12 @@ std::unique_ptr merge_lists( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return collected list with unique elements as 
scalar */ -std::unique_ptr collect_set( - column_view const& col, - null_policy null_handling, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr collect_set(column_view const& col, + null_policy null_handling, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Merge a bunch of list scalars into single list scalar then drop duplicated elements @@ -331,12 +317,11 @@ std::unique_ptr collect_set( * @param mr Device memory resource used to allocate the returned scalar's device memory * @return collected list with unique elements as scalar */ -std::unique_ptr merge_sets( - lists_column_view const& col, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr merge_sets(lists_column_view const& col, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace reduction } // namespace cudf diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 1979108eaa2..df590c0c4b9 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -37,6 +37,7 @@ #include #include +#include #include @@ -305,7 +306,8 @@ std::pair, std::unique_ptr
> groupby::shift( thrust::make_counting_iterator(values.num_columns()), std::back_inserter(results), [&](size_type i) { - auto grouped_values = helper().grouped_values(values.column(i), stream); + auto grouped_values = + helper().grouped_values(values.column(i), stream, rmm::mr::get_current_device_resource()); return cudf::detail::segmented_shift( grouped_values->view(), group_offsets, offsets[i], fill_values[i].get(), stream, mr); }); diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp index bcc190c745b..be36956b929 100644 --- a/cpp/src/groupby/sort/functors.hpp +++ b/cpp/src/groupby/sort/functors.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,7 +78,7 @@ struct store_result_functor { // It's overridden in scan implementation. return sorted_values->view(); else - return (grouped_values = helper.grouped_values(values, stream))->view(); + return (grouped_values = helper.grouped_values(values, stream, mr))->view(); }; /** @@ -90,7 +90,7 @@ struct store_result_functor { column_view get_sorted_values() { return sorted_values ? sorted_values->view() - : (sorted_values = helper.sorted_values(values, stream))->view(); + : (sorted_values = helper.sorted_values(values, stream, mr))->view(); }; protected: diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 743ca5e8065..1aaa06750db 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -69,7 +69,7 @@ struct scan_result_functor final : store_result_functor { if (grouped_values) return grouped_values->view(); else - return (grouped_values = helper.grouped_values(values, stream))->view(); + return (grouped_values = helper.grouped_values(values, stream, mr))->view(); }; }; diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 1d5a738f8ce..8039729d749 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -108,7 +108,8 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s if (d_strings.is_null(idx)) return false; return !d_strings.element(idx).empty(); }, - stream) + stream, + rmm::mr::get_current_device_resource()) ->release(); strings_count = table_offsets.front()->size() - 1; auto result = std::move(table_offsets.front()); From aeb05bfaacc43647dfce4a1a0a287545b0c06a87 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 20 Mar 2023 10:04:44 -0400 Subject: [PATCH 24/63] Remove default detail mrs: part7 (#12970) This is the seventh PR in a sequence removing default mr parameters in detail APIs. Contributes to https://github.com/rapidsai/cudf/issues/12944. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/12970 --- cpp/benchmarks/io/json/nested_json.cpp | 5 ++- cpp/src/io/json/json_column.cu | 12 +++-- cpp/src/io/json/nested_json.hpp | 40 ++++++++--------- cpp/src/io/json/nested_json_gpu.cu | 2 +- cpp/src/join/conditional_join.cu | 8 ++-- cpp/src/join/cross_join.cu | 11 +++-- cpp/src/join/join_common_utils.cuh | 9 ++-- cpp/src/join/mixed_join.cu | 7 ++- cpp/src/join/mixed_join_semi.cu | 4 +- cpp/src/join/semi_join.cu | 4 +- cpp/src/lists/copying/concatenate.cu | 9 ++-- cpp/src/lists/reverse.cu | 5 ++- cpp/src/lists/set_operations.cu | 44 ++++++++++-------- cpp/src/lists/stream_compaction/distinct.cu | 7 +-- cpp/src/lists/utilities.hpp | 18 ++++---- cpp/src/merge/merge.cu | 2 +- cpp/src/reductions/reductions.cpp | 15 +++---- cpp/src/reductions/scan/scan_inclusive.cu | 13 +++--- cpp/src/replace/clamp.cu | 45 +++++++++---------- cpp/src/stream_compaction/distinct.cu | 3 +- cpp/src/stream_compaction/distinct_reduce.cuh | 2 +- cpp/tests/io/json_tree.cpp | 24 +++++++--- 22 files changed, 154 insertions(+), 135 deletions(-) diff --git a/cpp/benchmarks/io/json/nested_json.cpp b/cpp/benchmarks/io/json/nested_json.cpp index 416cf403671..d03f36ca81f 100644 --- a/cpp/benchmarks/io/json/nested_json.cpp +++ b/cpp/benchmarks/io/json/nested_json.cpp @@ -171,7 +171,8 @@ void BM_NESTED_JSON(nvbench::state& state) cudf::io::json::detail::device_parse_nested_json( cudf::device_span{input->data(), static_cast(input->size())}, default_options, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); @@ -202,7 +203,7 @@ void BM_NESTED_JSON_DEPTH(nvbench::state& state) state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { // Allocate device-side temporary storage 
& run algorithm cudf::io::json::detail::device_parse_nested_json( - input, default_options, cudf::get_default_stream()); + input, default_options, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 16273b35a11..d174cc8aca3 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -893,7 +893,8 @@ table_with_metadata device_parse_nested_json(device_span d_input, // Parse the JSON and get the token stream const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream); // gpu tree generation - return get_tree_representation(tokens_gpu, token_indices_gpu, stream); + return get_tree_representation( + tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); }(); // IILE used to free memory of token data. #ifdef NJP_DEBUG_PRINT auto h_input = cudf::detail::make_host_vector_async(d_input, stream); @@ -913,8 +914,13 @@ table_with_metadata device_parse_nested_json(device_span d_input, return h_node_categories[0] == NC_LIST and h_node_categories[1] == NC_LIST; }(); - auto [gpu_col_id, gpu_row_offsets] = records_orient_tree_traversal( - d_input, gpu_tree, is_array_of_arrays, options.is_enabled_lines(), stream); + auto [gpu_col_id, gpu_row_offsets] = + records_orient_tree_traversal(d_input, + gpu_tree, + is_array_of_arrays, + options.is_enabled_lines(), + stream, + rmm::mr::get_current_device_resource()); device_json_column root_column(stream, mr); root_column.type = json_col_t::ListColumn; diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 409786d5f1d..f44b7d1ddcc 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -191,11 +191,10 @@ void get_stack_context(device_span json_in, * @return A tree representation of the input JSON string as vectors of node 
type, parent index, * level, begin index, and end index in the input JSON string */ -tree_meta_t get_tree_representation( - device_span tokens, - device_span token_indices, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +tree_meta_t get_tree_representation(device_span tokens, + device_span token_indices, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Traverse the tree representation of the JSON input in records orient format and populate @@ -211,13 +210,12 @@ tree_meta_t get_tree_representation( * @return A tuple of the output column indices and the row offsets within each column for each node */ std::tuple, rmm::device_uvector> -records_orient_tree_traversal( - device_span d_input, - tree_meta_t const& d_tree, - bool is_array_of_arrays, - bool is_enabled_lines, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +records_orient_tree_traversal(device_span d_input, + tree_meta_t const& d_tree, + bool is_array_of_arrays, + bool is_enabled_lines, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Searches for and selects nodes at level `row_array_children_level`. For each selected @@ -258,11 +256,10 @@ reduce_to_column_tree(tree_meta_t& tree, * All processing is done in device memory. * */ -table_with_metadata device_parse_nested_json( - device_span input, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +table_with_metadata device_parse_nested_json(device_span input, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Parses the given JSON string and generates table from the given input. 
@@ -273,11 +270,10 @@ table_with_metadata device_parse_nested_json( * @param mr Optional, resource with which to allocate * @return The data parsed from the given JSON input */ -table_with_metadata host_parse_nested_json( - device_span input, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +table_with_metadata host_parse_nested_json(device_span input, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index fb58b48d68d..30b3911089f 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1169,7 +1169,7 @@ void make_json_column(json_column& root_column, cudf::io::json_reader_options const& options, bool include_quote_char, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr) { // Range of encapsulating function that parses to internal columnar data representation CUDF_FUNC_RANGE(); diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index cf1476d8bcc..c3073524467 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,7 +56,8 @@ conditional_join(table_view const& left, // with a corresponding NULL from the right. 
case join_kind::LEFT_JOIN: case join_kind::LEFT_ANTI_JOIN: - case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream); + case join_kind::FULL_JOIN: + return get_trivial_left_join_indices(left, stream, rmm::mr::get_current_device_resource()); // Inner and left semi joins return empty output because no matches can exist. case join_kind::INNER_JOIN: case join_kind::LEFT_SEMI_JOIN: @@ -75,7 +76,8 @@ conditional_join(table_view const& left, std::make_unique>(0, stream, mr)); // Full joins need to return the trivial complement. case join_kind::FULL_JOIN: { - auto ret_flipped = get_trivial_left_join_indices(right, stream); + auto ret_flipped = + get_trivial_left_join_indices(right, stream, rmm::mr::get_current_device_resource()); return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); } default: CUDF_FAIL("Invalid join kind."); break; diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu index 7358726d69d..07057acf37e 100644 --- a/cpp/src/join/cross_join.cu +++ b/cpp/src/join/cross_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,11 +37,10 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches */ -std::unique_ptr cross_join( - cudf::table_view const& left, - cudf::table_view const& right, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr cross_join(cudf::table_view const& left, + cudf::table_view const& right, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(0 != left.num_columns(), "Left table is empty"); CUDF_EXPECTS(0 != right.num_columns(), "Right table is empty"); diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 44cddd2720e..bc4c62291b2 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -140,10 +140,9 @@ class pair_equality { */ std::pair>, std::unique_ptr>> -get_trivial_left_join_indices( - table_view const& left, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +get_trivial_left_join_indices(table_view const& left, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Builds the hash table based on the given `build_table`. diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 46e337a3363..d35aeab39ec 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -80,7 +80,9 @@ mixed_join( // Left and full joins all return all the row indices from // left with a corresponding NULL from the right. 
case join_kind::LEFT_JOIN: - case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left_conditional, stream); + case join_kind::FULL_JOIN: + return get_trivial_left_join_indices( + left_conditional, stream, rmm::mr::get_current_device_resource()); // Inner joins return empty output because no matches can exist. case join_kind::INNER_JOIN: return std::pair(std::make_unique>(0, stream, mr), @@ -96,7 +98,8 @@ mixed_join( std::make_unique>(0, stream, mr)); // Full joins need to return the trivial complement. case join_kind::FULL_JOIN: { - auto ret_flipped = get_trivial_left_join_indices(right_conditional, stream); + auto ret_flipped = get_trivial_left_join_indices( + right_conditional, stream, rmm::mr::get_current_device_resource()); return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); } default: CUDF_FAIL("Invalid join kind."); break; diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index b32df9316e2..fced5b1b33f 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -118,7 +118,9 @@ std::unique_ptr> mixed_join_semi( // Anti and semi return all the row indices from left // with a corresponding NULL from the right. case join_kind::LEFT_ANTI_JOIN: - return get_trivial_left_join_indices(left_conditional, stream).first; + return get_trivial_left_join_indices( + left_conditional, stream, rmm::mr::get_current_device_resource()) + .first; // Inner and left semi joins return empty output because no matches can exist. case join_kind::LEFT_SEMI_JOIN: return std::make_unique>(0, stream, mr); diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index cc523b2ac7f..2585ff457ee 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,7 +45,7 @@ std::unique_ptr> left_semi_anti_join( cudf::table_view const& right_keys, null_equality compare_nulls, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(0 != left_keys.num_columns(), "Left table is empty"); CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty"); diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index 22083f7ce99..8ca26c0ebfb 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -91,10 +91,9 @@ std::unique_ptr merge_offsets(host_span columns /** * @copydoc cudf::lists::detail::concatenate */ -std::unique_ptr concatenate( - host_span columns, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr concatenate(host_span columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { std::vector lists_columns; lists_columns.reserve(columns.size()); diff --git a/cpp/src/lists/reverse.cu b/cpp/src/lists/reverse.cu index c9c88270e10..d606f11bdb9 100644 --- a/cpp/src/lists/reverse.cu +++ b/cpp/src/lists/reverse.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -43,7 +43,8 @@ std::unique_ptr reverse(lists_column_view const& input, auto const child = input.get_sliced_child(stream); // The labels are also a map from each list element to its corresponding zero-based list index. - auto const labels = generate_labels(input, child.size(), stream); + auto const labels = + generate_labels(input, child.size(), stream, rmm::mr::get_current_device_resource()); // The offsets of the output lists column. auto out_offsets = get_normalized_offsets(input, stream, mr); diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index a31b7c6e5be..8df99153d74 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,12 +73,14 @@ std::unique_ptr have_overlap(lists_column_view const& lhs, // - `reduce_by_key` with keys are rhs_labels and `logical_or` reduction on the existence reults // computed in the previous step. 
- auto const lhs_child = lhs.get_sliced_child(stream); - auto const rhs_child = rhs.get_sliced_child(stream); - auto const lhs_labels = generate_labels(lhs, lhs_child.size(), stream); - auto const rhs_labels = generate_labels(rhs, rhs_child.size(), stream); - auto const lhs_table = table_view{{lhs_labels->view(), lhs_child}}; - auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}}; + auto const lhs_child = lhs.get_sliced_child(stream); + auto const rhs_child = rhs.get_sliced_child(stream); + auto const lhs_labels = + generate_labels(lhs, lhs_child.size(), stream, rmm::mr::get_current_device_resource()); + auto const rhs_labels = + generate_labels(rhs, rhs_child.size(), stream, rmm::mr::get_current_device_resource()); + auto const lhs_table = table_view{{lhs_labels->view(), lhs_child}}; + auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}}; // Check existence for each row of the rhs_table in lhs_table. auto const contained = @@ -140,12 +142,14 @@ std::unique_ptr intersect_distinct(lists_column_view const& lhs, // - Extract rows of the rhs table using the existence results computed in the previous step. // - Remove duplicate rows, and build the output lists. 
- auto const lhs_child = lhs.get_sliced_child(stream); - auto const rhs_child = rhs.get_sliced_child(stream); - auto const lhs_labels = generate_labels(lhs, lhs_child.size(), stream); - auto const rhs_labels = generate_labels(rhs, rhs_child.size(), stream); - auto const lhs_table = table_view{{lhs_labels->view(), lhs_child}}; - auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}}; + auto const lhs_child = lhs.get_sliced_child(stream); + auto const rhs_child = rhs.get_sliced_child(stream); + auto const lhs_labels = + generate_labels(lhs, lhs_child.size(), stream, rmm::mr::get_current_device_resource()); + auto const rhs_labels = + generate_labels(rhs, rhs_child.size(), stream, rmm::mr::get_current_device_resource()); + auto const lhs_table = table_view{{lhs_labels->view(), lhs_child}}; + auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}}; auto const contained = cudf::detail::contains(lhs_table, rhs_table, nulls_equal, nans_equal, stream); @@ -215,12 +219,14 @@ std::unique_ptr difference_distinct(lists_column_view const& lhs, // - Extract rows of the lhs table using that difference results. // - Remove duplicate rows, and build the output lists. 
- auto const lhs_child = lhs.get_sliced_child(stream); - auto const rhs_child = rhs.get_sliced_child(stream); - auto const lhs_labels = generate_labels(lhs, lhs_child.size(), stream); - auto const rhs_labels = generate_labels(rhs, rhs_child.size(), stream); - auto const lhs_table = table_view{{lhs_labels->view(), lhs_child}}; - auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}}; + auto const lhs_child = lhs.get_sliced_child(stream); + auto const rhs_child = rhs.get_sliced_child(stream); + auto const lhs_labels = + generate_labels(lhs, lhs_child.size(), stream, rmm::mr::get_current_device_resource()); + auto const rhs_labels = + generate_labels(rhs, rhs_child.size(), stream, rmm::mr::get_current_device_resource()); + auto const lhs_table = table_view{{lhs_labels->view(), lhs_child}}; + auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}}; auto const contained = cudf::detail::contains(rhs_table, lhs_table, nulls_equal, nans_equal, stream); diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu index d0e4557663e..48d8babb4fa 100644 --- a/cpp/src/lists/stream_compaction/distinct.cu +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -46,8 +46,9 @@ std::unique_ptr distinct(lists_column_view const& input, if (input.is_empty()) { return empty_like(input.parent()); } - auto const child = input.get_sliced_child(stream); - auto const labels = generate_labels(input, child.size(), stream); + auto const child = input.get_sliced_child(stream); + auto const labels = + generate_labels(input, child.size(), stream, rmm::mr::get_current_device_resource()); auto const distinct_table = cudf::detail::stable_distinct(table_view{{labels->view(), child}}, // input table diff --git a/cpp/src/lists/utilities.hpp b/cpp/src/lists/utilities.hpp index 76f8879c4d3..c881e828677 100644 --- a/cpp/src/lists/utilities.hpp +++ b/cpp/src/lists/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,11 +33,10 @@ namespace cudf::lists::detail { * @param mr Device memory resource used to allocate the returned object * @return A column containing list labels corresponding to each element in the child column */ -std::unique_ptr generate_labels( - lists_column_view const& input, - size_type n_elements, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr generate_labels(lists_column_view const& input, + size_type n_elements, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Reconstruct an offsets column from the input list labels column. 
@@ -61,9 +60,8 @@ std::unique_ptr reconstruct_offsets(column_view const& labels, * @param mr Device memory resource used to allocate the returned object * @return The output offsets column with values start from 0 */ -std::unique_ptr get_normalized_offsets( - lists_column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr get_normalized_offsets(lists_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace cudf::lists::detail diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 05842348807..ec0cc5af44d 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -241,7 +241,7 @@ struct column_merger { column_view const& lcol, column_view const& rcol, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const + rmm::mr::device_memory_resource* mr) const { auto lsz = lcol.size(); auto merged_size = lsz + rcol.size(); diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index d28cdee1de2..cae2699aac7 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -142,13 +142,12 @@ struct reduce_dispatch_functor { } }; -std::unique_ptr reduce( - column_view const& col, - reduce_aggregation const& agg, - data_type output_dtype, - std::optional> init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr reduce(column_view const& col, + reduce_aggregation const& agg, + data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!init.has_value() || col.type() == init.value().get().type(), "column and initial value must be the same type"); diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index 04a96666962..f453e7757a7 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -246,12 +246,11 @@ struct scan_dispatcher { } // namespace -std::unique_ptr scan_inclusive( - column_view const& input, - scan_aggregation const& agg, - null_policy null_handling, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr scan_inclusive(column_view const& input, + scan_aggregation const& agg, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto output = scan_agg_dispatch(input, agg, null_handling, stream, mr); diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index a275683d82c..68b496e0ab8 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -213,28 +213,26 @@ std::enable_if_t, std::unique_ptr> } // namespace template -std::unique_ptr clamp( - column_view const& input, - OptionalScalarIterator lo_itr, - ReplaceScalarIterator lo_replace_itr, - OptionalScalarIterator hi_itr, - ReplaceScalarIterator hi_replace_itr, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr clamp(column_view const& input, + OptionalScalarIterator lo_itr, + ReplaceScalarIterator lo_replace_itr, + OptionalScalarIterator hi_itr, + ReplaceScalarIterator hi_replace_itr, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return clamper(input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr, stream, mr); } struct dispatch_clamp { template - std::unique_ptr operator()( - column_view const& input, - scalar const& lo, - scalar const& lo_replace, - scalar const& hi, - scalar const& hi_replace, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(column_view const& input, + scalar const& lo, + scalar const& lo_replace, + scalar const& hi, + scalar const& hi_replace, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { 
CUDF_EXPECTS(lo.type() == input.type(), "mismatching types of scalar and input"); @@ -352,14 +350,13 @@ std::unique_ptr dispatch_clamp::operator()( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr clamp( - column_view const& input, - scalar const& lo, - scalar const& lo_replace, - scalar const& hi, - scalar const& hi_replace, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr clamp(column_view const& input, + scalar const& lo, + scalar const& lo_replace, + scalar const& hi, + scalar const& hi_replace, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lo.type() == hi.type(), "mismatching types of limit scalars"); CUDF_EXPECTS(lo_replace.type() == hi_replace.type(), "mismatching types of replace scalars"); diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index 083b1b2eb46..cc60b2a12ea 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -104,7 +104,8 @@ rmm::device_uvector get_distinct_indices(table_view const& input, keep, nulls_equal, nans_equal, - stream); + stream, + rmm::mr::get_current_device_resource()); // Extract the desired output indices from reduction results. 
auto const map_end = [&] { diff --git a/cpp/src/stream_compaction/distinct_reduce.cuh b/cpp/src/stream_compaction/distinct_reduce.cuh index e360d03280a..8ec1fa18205 100644 --- a/cpp/src/stream_compaction/distinct_reduce.cuh +++ b/cpp/src/stream_compaction/distinct_reduce.cuh @@ -82,6 +82,6 @@ rmm::device_uvector hash_reduce_by_row( null_equality nulls_equal, nan_equality nans_equal, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr); } // namespace cudf::detail diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index c6b181fe8a1..94a7c8edcf9 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -590,7 +590,8 @@ TEST_F(JsonTest, TreeRepresentation) cudf::io::json::detail::get_token_stream(d_input, options, stream); // Get the JSON's tree representation - auto gpu_tree = cuio_json::detail::get_tree_representation(tokens_gpu, token_indices_gpu, stream); + auto gpu_tree = cuio_json::detail::get_tree_representation( + tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); compare_trees(cpu_tree, gpu_tree); @@ -676,7 +677,8 @@ TEST_F(JsonTest, TreeRepresentation2) cudf::io::json::detail::get_token_stream(d_input, options, stream); // Get the JSON's tree representation - auto gpu_tree = cuio_json::detail::get_tree_representation(tokens_gpu, token_indices_gpu, stream); + auto gpu_tree = cuio_json::detail::get_tree_representation( + tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); compare_trees(cpu_tree, gpu_tree); @@ -749,7 +751,8 @@ TEST_F(JsonTest, TreeRepresentation3) cudf::io::json::detail::get_token_stream(d_input, options, stream); // Get the 
JSON's tree representation - auto gpu_tree = cuio_json::detail::get_tree_representation(tokens_gpu, token_indices_gpu, stream); + auto gpu_tree = cuio_json::detail::get_tree_representation( + tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); compare_trees(cpu_tree, gpu_tree); @@ -774,7 +777,8 @@ TEST_F(JsonTest, TreeRepresentationError) // Get the JSON's tree representation // This JSON is invalid and will raise an exception. - EXPECT_THROW(cuio_json::detail::get_tree_representation(tokens_gpu, token_indices_gpu, stream), + EXPECT_THROW(cuio_json::detail::get_tree_representation( + tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()), cudf::logic_error); } @@ -864,15 +868,21 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal) auto [cpu_col_id, cpu_row_offsets] = records_orient_tree_traversal_cpu(input, cpu_tree, is_array_of_arrays, json_lines, stream); // gpu tree generation - auto gpu_tree = cuio_json::detail::get_tree_representation(tokens_gpu, token_indices_gpu, stream); + auto gpu_tree = cuio_json::detail::get_tree_representation( + tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); // Print tree representation if (std::getenv("NJP_DEBUG_DUMP") != nullptr) { printf("BEFORE traversal (gpu_tree):\n"); print_tree(gpu_tree); } // gpu tree traversal - auto [gpu_col_id, gpu_row_offsets] = cuio_json::detail::records_orient_tree_traversal( - d_input, gpu_tree, is_array_of_arrays, json_lines, stream); + auto [gpu_col_id, gpu_row_offsets] = + cuio_json::detail::records_orient_tree_traversal(d_input, + gpu_tree, + is_array_of_arrays, + json_lines, + stream, + rmm::mr::get_current_device_resource()); // Print tree representation if (std::getenv("NJP_DEBUG_DUMP") != nullptr) { printf("AFTER traversal (gpu_tree):\n"); From 565efb74a85153088eee43e45d5d877cae731685 Mon Sep 17 
00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 20 Mar 2023 12:57:35 -0400 Subject: [PATCH 25/63] Remove default detail mrs: part6 (#12969) This is the sixth PR in a sequence removing default mr parameters in detail APIs. Contributes to https://github.com/rapidsai/cudf/issues/12944. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/12969 --- cpp/benchmarks/iterator/iterator.cu | 8 +- cpp/include/cudf/detail/gather.cuh | 6 +- cpp/include/cudf/detail/null_mask.cuh | 3 +- .../detail/utilities/vector_factories.hpp | 58 ++--- .../cudf/lists/lists_column_factories.hpp | 11 +- .../cudf/structs/detail/concatenate.hpp | 9 +- cpp/include/cudf_test/column_wrapper.hpp | 21 +- cpp/include/cudf_test/tdigest_utilities.cuh | 5 +- cpp/src/copying/concatenate.cu | 6 +- cpp/src/dictionary/detail/concatenate.cu | 5 +- cpp/src/groupby/hash/groupby.cu | 3 +- cpp/src/groupby/sort/group_quantiles.cu | 5 +- cpp/src/hash/unordered_multiset.cuh | 10 +- cpp/src/io/avro/reader_impl.cu | 9 +- cpp/src/io/comp/uncomp.cpp | 7 +- cpp/src/io/csv/csv_gpu.cu | 6 +- cpp/src/io/csv/reader_impl.cu | 31 +-- cpp/src/io/json/experimental/read_json.cpp | 3 +- cpp/src/io/json/json_column.cu | 6 +- cpp/src/io/json/json_gpu.cu | 4 +- cpp/src/io/json/nested_json_gpu.cu | 6 +- cpp/src/io/json/reader_impl.cu | 39 ++-- cpp/src/io/json/write_json.cu | 6 +- cpp/src/io/orc/reader_impl.cu | 9 +- cpp/src/io/orc/timezone.cpp | 12 +- cpp/src/io/orc/writer_impl.cu | 27 ++- cpp/src/io/parquet/reader_impl_preprocess.cu | 16 +- cpp/src/io/parquet/writer_impl.cu | 12 +- cpp/src/io/utilities/column_buffer.cpp | 7 +- cpp/src/io/utilities/parsing_utils.cu | 8 +- cpp/src/io/utilities/trie.cu | 5 +- cpp/src/lists/dremel.cu | 3 +- cpp/src/merge/merge.cu | 6 +- 
cpp/src/partitioning/partitioning.cu | 8 +- cpp/src/quantiles/quantile.cu | 5 +- cpp/src/quantiles/quantiles.cu | 5 +- cpp/src/reductions/struct_minmax_util.cuh | 3 +- cpp/src/rolling/grouped_rolling.cu | 8 +- cpp/src/strings/convert/convert_datetime.cu | 3 +- cpp/src/strings/copying/concatenate.cu | 3 +- cpp/src/strings/filter_chars.cu | 5 +- cpp/src/strings/json/json_path.cu | 9 +- cpp/src/strings/replace/backref_re.cu | 8 +- cpp/src/strings/replace/multi_re.cu | 5 +- cpp/src/strings/translate.cu | 4 +- cpp/src/table/row_operators.cu | 14 +- cpp/src/transform/row_bit_count.cu | 5 +- cpp/tests/bitmask/bitmask_tests.cpp | 2 +- .../device_atomics/device_atomics_test.cu | 15 +- cpp/tests/fixed_point/fixed_point_tests.cu | 8 +- cpp/tests/io/nested_json_test.cpp | 34 ++- cpp/tests/io/parquet_test.cpp | 3 +- cpp/tests/io/type_inference_test.cu | 86 ++++---- cpp/tests/iterator/iterator_tests.cuh | 6 +- cpp/tests/iterator/value_iterator_test.cuh | 5 +- .../iterator/value_iterator_test_strings.cu | 20 +- .../partitioning/hash_partition_test.cpp | 4 +- .../reductions/segmented_reduction_tests.cpp | 206 +++++++++--------- cpp/tests/scalar/scalar_device_view_test.cu | 5 +- cpp/tests/strings/contains_tests.cpp | 8 +- cpp/tests/strings/factories_test.cu | 18 +- cpp/tests/strings/integers_tests.cpp | 5 +- cpp/tests/table/table_view_tests.cu | 6 +- cpp/tests/types/type_dispatcher_test.cu | 8 +- cpp/tests/utilities/tdigest_utilities.cu | 12 +- cpp/tests/utilities_tests/span_tests.cu | 6 +- java/src/main/native/src/row_conversion.cu | 62 ++++-- 67 files changed, 547 insertions(+), 429 deletions(-) diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index 73060200d00..1b1cf9b7e9d 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -140,8 +140,8 @@ void BM_iterator(benchmark::State& state) cudf::column_view hasnull_F = wrap_hasnull_F; // Initialize dev_result to false - auto dev_result = - cudf::detail::make_zeroed_device_uvector_sync(1, cudf::get_default_stream()); + auto dev_result = cudf::detail::make_zeroed_device_uvector_sync( + 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { @@ -210,7 +210,7 @@ void BM_pair_iterator(benchmark::State& state) // Initialize dev_result to false auto dev_result = cudf::detail::make_zeroed_device_uvector_sync>( - 1, cudf::get_default_stream()); + 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index ac2865c05c5..5460a0e5a76 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -583,10 +583,12 @@ void gather_bitmask(table_view const& source, std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) { return col->mutable_view().null_mask(); }); - auto d_target_masks = make_device_uvector_async(target_masks, stream); + auto d_target_masks = + make_device_uvector_async(target_masks, stream, rmm::mr::get_current_device_resource()); auto const device_source = table_device_view::create(source, stream); - auto d_valid_counts = make_zeroed_device_uvector_async(target.size(), stream); + auto d_valid_counts = make_zeroed_device_uvector_async( + target.size(), stream, rmm::mr::get_current_device_resource()); // Dispatch operation enum to get implementation auto const impl = [op]() { diff --git 
a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index cb9ced6fc28..ce2619d767e 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -426,7 +426,8 @@ std::vector segmented_count_bits(bitmask_type const* bitmask, // Construct a contiguous host buffer of indices and copy to device. auto const h_indices = std::vector(indices_begin, indices_end); - auto const d_indices = make_device_uvector_async(h_indices, stream); + auto const d_indices = + make_device_uvector_async(h_indices, stream, rmm::mr::get_current_device_resource()); // Compute the bit counts over each segment. auto first_bit_indices_begin = thrust::make_transform_iterator( diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 75df0d92d0a..c446a7b5148 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -48,10 +48,9 @@ namespace detail { * @return A device_uvector containing zeros */ template -rmm::device_uvector make_zeroed_device_uvector_async( - std::size_t size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +rmm::device_uvector make_zeroed_device_uvector_async(std::size_t size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_uvector ret(size, stream, mr); CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value())); @@ -70,10 +69,9 @@ rmm::device_uvector make_zeroed_device_uvector_async( * @return A device_uvector containing zeros */ template -rmm::device_uvector make_zeroed_device_uvector_sync( - std::size_t size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +rmm::device_uvector make_zeroed_device_uvector_sync(std::size_t size, + rmm::cuda_stream_view stream, + 
rmm::mr::device_memory_resource* mr) { rmm::device_uvector ret(size, stream, mr); CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value())); @@ -94,10 +92,9 @@ rmm::device_uvector make_zeroed_device_uvector_sync( * @return A device_uvector containing the copied data */ template -rmm::device_uvector make_device_uvector_async( - host_span source_data, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +rmm::device_uvector make_device_uvector_async(host_span source_data, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_uvector ret(source_data.size(), stream, mr); CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(), @@ -126,9 +123,7 @@ template < std::enable_if_t< std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_async( - Container const& c, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return make_device_uvector_async(host_span{c}, stream, mr); } @@ -146,10 +141,9 @@ rmm::device_uvector make_device_uvector_async( * @return A device_uvector containing the copied data */ template -rmm::device_uvector make_device_uvector_async( - device_span source_data, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +rmm::device_uvector make_device_uvector_async(device_span source_data, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_uvector ret(source_data.size(), stream, mr); CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(), @@ -178,9 +172,7 @@ template < std::enable_if_t< std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_async( - Container const& c, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + Container const& c, 
rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return make_device_uvector_async( device_span{c}, stream, mr); @@ -199,10 +191,9 @@ rmm::device_uvector make_device_uvector_async( * @return A device_uvector containing the copied data */ template -rmm::device_uvector make_device_uvector_sync( - host_span source_data, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +rmm::device_uvector make_device_uvector_sync(host_span source_data, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto ret = make_device_uvector_async(source_data, stream, mr); stream.synchronize(); @@ -227,9 +218,7 @@ template < std::enable_if_t< std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_sync( - Container const& c, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return make_device_uvector_sync(host_span{c}, stream, mr); } @@ -247,10 +236,9 @@ rmm::device_uvector make_device_uvector_sync( * @return A device_uvector containing the copied data */ template -rmm::device_uvector make_device_uvector_sync( - device_span source_data, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +rmm::device_uvector make_device_uvector_sync(device_span source_data, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto ret = make_device_uvector_async(source_data, stream, mr); stream.synchronize(); @@ -275,9 +263,7 @@ template < std::enable_if_t< std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_sync( - Container const& c, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { 
return make_device_uvector_sync(device_span{c}, stream, mr); } diff --git a/cpp/include/cudf/lists/lists_column_factories.hpp b/cpp/include/cudf/lists/lists_column_factories.hpp index a6eacb97e91..fea1118748c 100644 --- a/cpp/include/cudf/lists/lists_column_factories.hpp +++ b/cpp/include/cudf/lists/lists_column_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,11 +35,10 @@ namespace detail { * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @param[in] mr Device memory resource used to allocate the returned column's device memory. */ -std::unique_ptr make_lists_column_from_scalar( - list_scalar const& value, - size_type size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr make_lists_column_from_scalar(list_scalar const& value, + size_type size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/structs/detail/concatenate.hpp b/cpp/include/cudf/structs/detail/concatenate.hpp index a098703e4b0..82ccca188e2 100644 --- a/cpp/include/cudf/structs/detail/concatenate.hpp +++ b/cpp/include/cudf/structs/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,10 +48,9 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. 
*/ -std::unique_ptr concatenate( - host_span columns, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate(host_span columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace structs diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 91773b2c3f1..6341e2e10b0 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -732,9 +732,11 @@ class strings_column_wrapper : public detail::column_wrapper { { auto all_valid = thrust::make_constant_iterator(true); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid); - auto d_chars = cudf::detail::make_device_uvector_sync(chars, cudf::get_default_stream()); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, cudf::get_default_stream()); - wrapped = cudf::make_strings_column(d_chars, d_offsets); + auto d_chars = cudf::detail::make_device_uvector_sync( + chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_offsets = cudf::detail::make_device_uvector_sync( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + wrapped = cudf::make_strings_column(d_chars, d_offsets); } /** @@ -772,10 +774,13 @@ class strings_column_wrapper : public detail::column_wrapper { size_type num_strings = std::distance(begin, end); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v); auto null_mask = detail::make_null_mask_vector(v, v + num_strings); - auto d_chars = cudf::detail::make_device_uvector_sync(chars, cudf::get_default_stream()); - auto 
d_offsets = cudf::detail::make_device_uvector_sync(offsets, cudf::get_default_stream()); - auto d_bitmask = cudf::detail::make_device_uvector_sync(null_mask, cudf::get_default_stream()); - wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask); + auto d_chars = cudf::detail::make_device_uvector_sync( + chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_offsets = cudf::detail::make_device_uvector_sync( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_bitmask = cudf::detail::make_device_uvector_sync( + null_mask, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask); } /** diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh index ce45ad91be1..d23d7f29a6c 100644 --- a/cpp/include/cudf_test/tdigest_utilities.cuh +++ b/cpp/include/cudf_test/tdigest_utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -168,7 +168,8 @@ void tdigest_minmax_compare(cudf::tdigest::tdigest_column_view const& tdv, // verify min/max thrust::host_vector> h_spans; h_spans.push_back({input_values.begin(), static_cast(input_values.size())}); - auto spans = cudf::detail::make_device_uvector_async(h_spans, cudf::get_default_stream()); + auto spans = cudf::detail::make_device_uvector_async( + h_spans, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto expected_min = cudf::make_fixed_width_column( data_type{type_id::FLOAT64}, spans.size(), mask_state::UNALLOCATED); diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 5d36d70696c..6d6ef9fd7b0 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -76,7 +76,8 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi std::back_inserter(device_views), [](auto const& col) { return *col; }); - auto d_views = make_device_uvector_async(device_views, stream); + auto d_views = + make_device_uvector_async(device_views, stream, rmm::mr::get_current_device_resource()); // Compute the partition offsets auto offsets = thrust::host_vector(views.size() + 1); @@ -87,7 +88,8 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi std::next(offsets.begin()), [](auto const& col) { return col.size(); }, thrust::plus{}); - auto d_offsets = make_device_uvector_async(offsets, stream); + auto d_offsets = + make_device_uvector_async(offsets, stream, rmm::mr::get_current_device_resource()); auto const output_size = offsets.back(); return std::make_tuple( diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index d4f3a9ca495..bc54f65bbd3 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -114,7 +114,8 @@ struct compute_children_offsets_fn { [](auto lhs, auto rhs) { return offsets_pair{lhs.first + rhs.first, lhs.second + rhs.second}; }); - return cudf::detail::make_device_uvector_sync(offsets, stream); + return cudf::detail::make_device_uvector_sync( + offsets, stream, rmm::mr::get_current_device_resource()); } private: diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 72ac6255549..f8203218760 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -481,7 +481,8 @@ void compute_single_pass_aggs(table_view const& keys, // prepare to launch kernel to do the actual aggregation auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); auto d_values = table_device_view::create(flattened_values, stream); - auto const d_aggs = cudf::detail::make_device_uvector_async(agg_kinds, stream); + auto const d_aggs = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); auto const skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index 90ca5a5c90e..a9edcfecbf7 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -156,7 +156,8 @@ std::unique_ptr group_quantiles(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto dv_quantiles = cudf::detail::make_device_uvector_async(quantiles, stream); + auto dv_quantiles = cudf::detail::make_device_uvector_async( + quantiles, stream, rmm::mr::get_current_device_resource()); auto values_type = cudf::is_dictionary(values.type()) ? dictionary_column_view(values).keys().type() diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh index c017fd43079..55036bec6a6 100644 --- a/cpp/src/hash/unordered_multiset.cuh +++ b/cpp/src/hash/unordered_multiset.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -84,10 +84,10 @@ class unordered_multiset { auto d_column = column_device_view::create(col, stream); auto d_col = *d_column; - auto hash_bins_start = - cudf::detail::make_zeroed_device_uvector_async(2 * d_col.size() + 1, stream); - auto hash_bins_end = - cudf::detail::make_zeroed_device_uvector_async(2 * d_col.size() + 1, stream); + auto hash_bins_start = cudf::detail::make_zeroed_device_uvector_async( + 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); + auto hash_bins_end = cudf::detail::make_zeroed_device_uvector_async( + 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); auto hash_data = rmm::device_uvector(d_col.size(), stream); Hasher hasher; diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index cd557ff166a..60a1b4263b2 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -444,7 +444,8 @@ std::vector decode_data(metadata& meta, } } - auto block_list = cudf::detail::make_device_uvector_async(meta.block_list, stream); 
+ auto block_list = cudf::detail::make_device_uvector_async( + meta.block_list, stream, rmm::mr::get_current_device_resource()); schema_desc.host_to_device(stream); @@ -574,8 +575,10 @@ table_with_metadata read_avro(std::unique_ptr&& source, } } - d_global_dict = cudf::detail::make_device_uvector_async(h_global_dict, stream); - d_global_dict_data = cudf::detail::make_device_uvector_async(h_global_dict_data, stream); + d_global_dict = cudf::detail::make_device_uvector_async( + h_global_dict, stream, rmm::mr::get_current_device_resource()); + d_global_dict_data = cudf::detail::make_device_uvector_async( + h_global_dict_data, stream, rmm::mr::get_current_device_resource()); stream.synchronize(); } diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 6778ddead28..008c7215cca 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -509,9 +509,10 @@ size_t decompress_zstd(host_span src, rmm::cuda_stream_view stream) { // Init device span of spans (source) - auto const d_src = cudf::detail::make_device_uvector_async(src, stream); - auto hd_srcs = hostdevice_vector>(1, stream); - hd_srcs[0] = d_src; + auto const d_src = + cudf::detail::make_device_uvector_async(src, stream, rmm::mr::get_current_device_resource()); + auto hd_srcs = hostdevice_vector>(1, stream); + hd_srcs[0] = d_src; hd_srcs.host_to_device(stream); // Init device span of spans (temporary destination) diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 4f6f8162246..51e3783bac5 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -794,8 +794,8 @@ std::vector detect_column_types( const int block_size = csvparse_block_dim; const int grid_size = (row_starts.size() + block_size - 1) / block_size; - auto d_stats = - detail::make_zeroed_device_uvector_async(num_active_columns, stream); + auto d_stats = detail::make_zeroed_device_uvector_async( + num_active_columns, stream, rmm::mr::get_current_device_resource()); data_type_detection<<>>( options, data, column_flags, row_starts, d_stats); diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 2e38ea7f4ab..9c1ff67d97c 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -523,13 +523,13 @@ void infer_column_types(parse_options const& parse_opts, }); if (num_inferred_columns == 0) { return; } - auto const column_stats = - cudf::io::csv::gpu::detect_column_types(parse_opts.view(), - data, - make_device_uvector_async(column_flags, stream), - row_offsets, - num_inferred_columns, - stream); + auto const column_stats = cudf::io::csv::gpu::detect_column_types( + parse_opts.view(), + data, + make_device_uvector_async(column_flags, stream, rmm::mr::get_current_device_resource()), + row_offsets, + num_inferred_columns, + stream); stream.synchronize(); auto inf_col_idx = 0; @@ -595,14 +595,15 @@ std::vector decode_data(parse_options const& parse_opts, h_valid[i] = out_buffers[i].null_mask(); } - cudf::io::csv::gpu::decode_row_column_data(parse_opts.view(), - data, - make_device_uvector_async(column_flags, stream), - row_offsets, - make_device_uvector_async(column_types, stream), - make_device_uvector_async(h_data, stream), - make_device_uvector_async(h_valid, stream), - stream); + cudf::io::csv::gpu::decode_row_column_data( + parse_opts.view(), + data, + make_device_uvector_async(column_flags, stream, rmm::mr::get_current_device_resource()), + row_offsets, + make_device_uvector_async(column_types, stream, rmm::mr::get_current_device_resource()), + make_device_uvector_async(h_data, stream, 
rmm::mr::get_current_device_resource()), + make_device_uvector_async(h_valid, stream, rmm::mr::get_current_device_resource()), + stream); return out_buffers; } diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index 70a0b66ebc6..c18b15708ab 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -80,7 +80,8 @@ rmm::device_uvector ingest_raw_input(host_span auto const uncomp_data = decompress(compression, buffer); return cudf::detail::make_device_uvector_sync( host_span{reinterpret_cast(uncomp_data.data()), uncomp_data.size()}, - stream); + stream, + rmm::mr::get_current_device_resource()); } } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index d174cc8aca3..f4d65f37cdb 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -602,8 +602,10 @@ void make_device_json_column(device_span input, col.validity.data()}; } - auto d_ignore_vals = cudf::detail::make_device_uvector_async(ignore_vals, stream); - auto d_columns_data = cudf::detail::make_device_uvector_async(columns_data, stream); + auto d_ignore_vals = cudf::detail::make_device_uvector_async( + ignore_vals, stream, rmm::mr::get_current_device_resource()); + auto d_columns_data = cudf::detail::make_device_uvector_async( + columns_data, stream, rmm::mr::get_current_device_resource()); // 3. scatter string offsets to respective columns, set validity bits thrust::for_each_n( diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 8b6c0f9d528..d1711db0484 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -578,7 +578,7 @@ std::vector detect_data_types( return d_column_infos; } else { return cudf::detail::make_zeroed_device_uvector_async( - num_columns, stream); + num_columns, stream, rmm::mr::get_current_device_resource()); } }(); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 30b3911089f..77749b42781 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1597,9 +1597,11 @@ std::pair, std::vector> json_column_to // Move string_offsets and string_lengths to GPU rmm::device_uvector d_string_offsets = - cudf::detail::make_device_uvector_async(json_col.string_offsets, stream); + cudf::detail::make_device_uvector_async( + json_col.string_offsets, stream, rmm::mr::get_current_device_resource()); rmm::device_uvector d_string_lengths = - cudf::detail::make_device_uvector_async(json_col.string_lengths, stream); + cudf::detail::make_device_uvector_async( + json_col.string_lengths, stream, rmm::mr::get_current_device_resource()); // Prepare iterator that returns (string_offset, string_length)-tuples auto offset_length_it = diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 6e1089796de..7ae8deb8055 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -340,8 +340,8 @@ rmm::device_uvector upload_data_to_device(json_reader_options const& reade "Error finding the record within the specified byte range.\n"); // Upload the raw data that is within the rows of interest - return cudf::detail::make_device_uvector_async(h_data.subspan(start_offset, bytes_to_upload), - stream); + return cudf::detail::make_device_uvector_async( + h_data.subspan(start_offset, bytes_to_upload), stream, rmm::mr::get_current_device_resource()); } std::pair, col_map_ptr_type> get_column_names_and_map( @@ -512,11 +512,14 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, h_valid[i] = out_buffers[i].null_mask(); } - auto d_dtypes = 
cudf::detail::make_device_uvector_async(h_dtypes, stream); - auto d_data = cudf::detail::make_device_uvector_async(h_data, stream); - auto d_valid = cudf::detail::make_device_uvector_async(h_valid, stream); - auto d_valid_counts = - cudf::detail::make_zeroed_device_uvector_async(num_columns, stream); + auto d_dtypes = cudf::detail::make_device_uvector_async( + h_dtypes, stream, rmm::mr::get_current_device_resource()); + auto d_data = cudf::detail::make_device_uvector_async( + h_data, stream, rmm::mr::get_current_device_resource()); + auto d_valid = cudf::detail::make_device_uvector_async( + h_valid, stream, rmm::mr::get_current_device_resource()); + auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async( + num_columns, stream, rmm::mr::get_current_device_resource()); cudf::io::json::gpu::convert_json_to_columns( parse_opts, data, rec_starts, d_dtypes, column_map, d_data, d_valid, d_valid_counts, stream); @@ -530,13 +533,18 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, auto repl_chars = std::vector{'"', '\\', '\t', '\r', '\b'}; auto repl_offsets = std::vector{0, 1, 2, 3, 4, 5}; - auto target = make_strings_column(cudf::detail::make_device_uvector_async(target_chars, stream), - cudf::detail::make_device_uvector_async(target_offsets, stream), - {}, - 0, - stream); - auto repl = make_strings_column(cudf::detail::make_device_uvector_async(repl_chars, stream), - cudf::detail::make_device_uvector_async(repl_offsets, stream), + auto target = + make_strings_column(cudf::detail::make_device_uvector_async( + target_chars, stream, rmm::mr::get_current_device_resource()), + cudf::detail::make_device_uvector_async( + target_offsets, stream, rmm::mr::get_current_device_resource()), + {}, + 0, + stream); + auto repl = make_strings_column(cudf::detail::make_device_uvector_async( + repl_chars, stream, rmm::mr::get_current_device_resource()), + cudf::detail::make_device_uvector_async( + repl_offsets, stream, 
rmm::mr::get_current_device_resource()), {}, 0, stream); @@ -617,7 +625,8 @@ table_with_metadata read_json(std::vector>& sources, auto d_data = rmm::device_uvector(0, stream); if (should_load_whole_source(reader_opts)) { - d_data = cudf::detail::make_device_uvector_async(h_data, stream); + d_data = cudf::detail::make_device_uvector_async( + h_data, stream, rmm::mr::get_current_device_resource()); } auto rec_starts = find_record_starts(reader_opts, h_data, d_data, stream); diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index b4bcb5548de..9e56b20114c 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -552,14 +552,16 @@ std::unique_ptr make_strings_column_from_host(host_span offsets(host_strings.size() + 1, 0); std::transform_inclusive_scan(host_strings.begin(), host_strings.end(), offsets.begin() + 1, std::plus{}, [](auto& str) { return str.size(); }); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, stream); + auto d_offsets = + cudf::detail::make_device_uvector_sync(offsets, stream, rmm::mr::get_current_device_resource()); return cudf::make_strings_column( host_strings.size(), std::move(d_offsets), std::move(d_chars), {}, 0); } diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 96eb20e1e66..bbc88a16c6a 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -576,8 +576,8 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]); } } - auto const d_prefix_sums_to_update = - cudf::detail::make_device_uvector_async(prefix_sums_to_update, stream); + auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( + prefix_sums_to_update, stream, rmm::mr::get_current_device_resource()); thrust::for_each(rmm::exec_policy(stream), d_prefix_sums_to_update.begin(), @@ -1038,7 +1038,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, 
selected_columns.levels[level].size(), [&]() { return cudf::detail::make_zeroed_device_uvector_async( - total_num_stripes, stream); + total_num_stripes, stream, rmm::mr::get_current_device_resource()); }); // Tracker for eventually deallocating compressed and uncompressed data @@ -1270,7 +1270,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, }); if (buff_data.size()) { - auto const dev_buff_data = cudf::detail::make_device_uvector_async(buff_data, stream); + auto const dev_buff_data = cudf::detail::make_device_uvector_async( + buff_data, stream, rmm::mr::get_current_device_resource()); generate_offsets_for_list(dev_buff_data, stream); } } diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/io/orc/timezone.cpp index 810dfe87320..416369cc3f0 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/io/orc/timezone.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,8 @@ #include +#include + #include #include @@ -461,9 +463,11 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name, .count(); } - rmm::device_uvector d_ttimes = cudf::detail::make_device_uvector_async(ttimes, stream); - rmm::device_uvector d_offsets = cudf::detail::make_device_uvector_async(offsets, stream); - auto const gmt_offset = get_gmt_offset(ttimes, offsets, orc_utc_offset); + rmm::device_uvector d_ttimes = + cudf::detail::make_device_uvector_async(ttimes, stream, rmm::mr::get_current_device_resource()); + rmm::device_uvector d_offsets = cudf::detail::make_device_uvector_async( + offsets, stream, rmm::mr::get_current_device_resource()); + auto const gmt_offset = get_gmt_offset(ttimes, offsets, orc_utc_offset); stream.synchronize(); return {gmt_offset, std::move(d_ttimes), std::move(d_offsets)}; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 00b5c5428b1..1ee90bde1d2 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -418,7 +418,7 @@ void init_dictionaries(orc_table_view& orc_table, [&](auto& col_idx) { auto& str_column = orc_table.column(col_idx); return cudf::detail::make_zeroed_device_uvector_async( - str_column.size(), stream); + str_column.size(), stream, rmm::mr::get_current_device_resource()); }); // Create views of the temporary buffers in device memory @@ -428,7 +428,8 @@ void init_dictionaries(orc_table_view& orc_table, dict_indices.begin(), dict_indices.end(), std::back_inserter(dict_indices_views), [](auto& di) { return device_span{di}; }); - auto d_dict_indices_views = cudf::detail::make_device_uvector_async(dict_indices_views, stream); + auto d_dict_indices_views = cudf::detail::make_device_uvector_async( + dict_indices_views, stream, rmm::mr::get_current_device_resource()); gpu::InitDictionaryIndices(orc_table.d_columns, *dict, @@ -772,7 +773,8 @@ std::vector> calculate_aligned_rowgroup_bounds( aligned_rgs.count() * sizeof(rowgroup_rows), 
cudaMemcpyDefault, stream.value())); - auto const d_stripes = cudf::detail::make_device_uvector_async(segmentation.stripes, stream); + auto const d_stripes = cudf::detail::make_device_uvector_async( + segmentation.stripes, stream, rmm::mr::get_current_device_resource()); // One thread per column, per stripe thrust::for_each_n( @@ -1675,7 +1677,8 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, } // Attach null masks to device column views (async) - auto const d_mask_ptrs = cudf::detail::make_device_uvector_async(mask_ptrs, stream); + auto const d_mask_ptrs = cudf::detail::make_device_uvector_async( + mask_ptrs, stream, rmm::mr::get_current_device_resource()); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0ul), @@ -1765,7 +1768,8 @@ orc_table_view make_orc_table_view(table_view const& table, orc_columns.cbegin(), orc_columns.cend(), std::back_inserter(type_kinds), [](auto& orc_column) { return orc_column.orc_kind(); }); - auto const d_type_kinds = cudf::detail::make_device_uvector_async(type_kinds, stream); + auto const d_type_kinds = cudf::detail::make_device_uvector_async( + type_kinds, stream, rmm::mr::get_current_device_resource()); rmm::device_uvector d_orc_columns(orc_columns.size(), stream); using stack_value_type = thrust::pair>; @@ -1815,7 +1819,8 @@ orc_table_view make_orc_table_view(table_view const& table, return {std::move(orc_columns), std::move(d_orc_columns), str_col_indexes, - cudf::detail::make_device_uvector_sync(str_col_indexes, stream)}; + cudf::detail::make_device_uvector_sync( + str_col_indexes, stream, rmm::mr::get_current_device_resource())}; } hostdevice_2dvector calculate_rowgroup_bounds(orc_table_view const& orc_table, @@ -1983,7 +1988,7 @@ string_dictionaries allocate_dictionaries(orc_table_view const& orc_table, std::back_inserter(data), [&](auto& idx) { return cudf::detail::make_zeroed_device_uvector_async( - orc_table.columns[idx].size(), stream); + 
orc_table.columns[idx].size(), stream, rmm::mr::get_current_device_resource()); }); std::vector> index; std::transform(orc_table.string_column_indices.begin(), @@ -1991,7 +1996,7 @@ string_dictionaries allocate_dictionaries(orc_table_view const& orc_table, std::back_inserter(index), [&](auto& idx) { return cudf::detail::make_zeroed_device_uvector_async( - orc_table.columns[idx].size(), stream); + orc_table.columns[idx].size(), stream, rmm::mr::get_current_device_resource()); }); stream.synchronize(); @@ -2006,8 +2011,10 @@ string_dictionaries allocate_dictionaries(orc_table_view const& orc_table, return {std::move(data), std::move(index), - cudf::detail::make_device_uvector_sync(data_ptrs, stream), - cudf::detail::make_device_uvector_sync(index_ptrs, stream), + cudf::detail::make_device_uvector_sync( + data_ptrs, stream, rmm::mr::get_current_device_resource()), + cudf::detail::make_device_uvector_sync( + index_ptrs, stream, rmm::mr::get_current_device_resource()), std::move(is_dict_enabled)}; } diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 6b5d4ba3640..e5c2b7aa842 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -468,10 +468,12 @@ void decode_page_headers(hostdevice_vector& chunks, host_span const> comp_in_view{comp_in.data() + start_pos, codec.num_pages}; - auto const d_comp_in = cudf::detail::make_device_uvector_async(comp_in_view, stream); + auto const d_comp_in = cudf::detail::make_device_uvector_async( + comp_in_view, stream, rmm::mr::get_current_device_resource()); host_span const> comp_out_view(comp_out.data() + start_pos, codec.num_pages); - auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, stream); + auto const d_comp_out = cudf::detail::make_device_uvector_async( + comp_out_view, stream, rmm::mr::get_current_device_resource()); device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); 
switch (codec.compression_type) { @@ -523,8 +525,10 @@ void decode_page_headers(hostdevice_vector& chunks, // now copy the uncompressed V2 def and rep level data if (not copy_in.empty()) { - auto const d_copy_in = cudf::detail::make_device_uvector_async(copy_in, stream); - auto const d_copy_out = cudf::detail::make_device_uvector_async(copy_out, stream); + auto const d_copy_in = cudf::detail::make_device_uvector_async( + copy_in, stream, rmm::mr::get_current_device_resource()); + auto const d_copy_out = cudf::detail::make_device_uvector_async( + copy_out, stream, rmm::mr::get_current_device_resource()); gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream); stream.synchronize(); @@ -1489,8 +1493,8 @@ void reader::impl::preprocess_pages(size_t skip_rows, // Build index for string dictionaries since they can't be indexed // directly due to variable-sized elements _chunk_itm_data.str_dict_index = - cudf::detail::make_zeroed_device_uvector_async(total_str_dict_indexes, - _stream); + cudf::detail::make_zeroed_device_uvector_async( + total_str_dict_indexes, _stream, rmm::mr::get_current_device_resource()); // Update chunks with pointers to string dict indices for (size_t c = 0, page_count = 0, str_ofs = 0; c < chunks.size(); c++) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 5f407b5e774..e6e14908f36 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -853,7 +853,8 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, _nullability = std::vector(r_nullability.crbegin(), r_nullability.crend()); // TODO(cp): Explore doing this for all columns in a single go outside this ctor. Maybe using // hostdevice_vector. Currently this involves a cudaMemcpyAsync for each column. 
- _d_nullability = cudf::detail::make_device_uvector_async(_nullability, stream); + _d_nullability = cudf::detail::make_device_uvector_async( + _nullability, stream, rmm::mr::get_current_device_resource()); _is_list = (_max_rep_level > 0); @@ -928,7 +929,8 @@ void writer::impl::init_row_group_fragments( device_span part_frag_offset, uint32_t fragment_size) { - auto d_partitions = cudf::detail::make_device_uvector_async(partitions, stream); + auto d_partitions = cudf::detail::make_device_uvector_async( + partitions, stream, rmm::mr::get_current_device_resource()); gpu::InitRowGroupFragments(frag, col_desc, d_partitions, part_frag_offset, fragment_size, stream); frag.device_to_host(stream, true); } @@ -936,7 +938,8 @@ void writer::impl::init_row_group_fragments( void writer::impl::calculate_page_fragments(device_span frag, host_span frag_sizes) { - auto d_frag_sz = cudf::detail::make_device_uvector_async(frag_sizes, stream); + auto d_frag_sz = cudf::detail::make_device_uvector_async( + frag_sizes, stream, rmm::mr::get_current_device_resource()); gpu::CalculatePageFragments(frag, d_frag_sz, stream); } @@ -1507,7 +1510,8 @@ void writer::impl::write(table_view const& table, std::vector co num_frag_in_part.begin(), num_frag_in_part.end(), std::back_inserter(part_frag_offset), 0); part_frag_offset.push_back(part_frag_offset.back() + num_frag_in_part.back()); - auto d_part_frag_offset = cudf::detail::make_device_uvector_async(part_frag_offset, stream); + auto d_part_frag_offset = cudf::detail::make_device_uvector_async( + part_frag_offset, stream, rmm::mr::get_current_device_resource()); cudf::detail::hostdevice_2dvector row_group_fragments( num_columns, num_fragments, stream); diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 6c14975101c..430d7c4a26d 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,8 @@ #include #include +#include + namespace cudf { namespace io { namespace detail { @@ -43,7 +45,8 @@ void column_buffer::create(size_type _size, // make_zeroed_device_uvector_async here and instead let it use the // default rmm memory resource. _strings = std::make_unique>( - cudf::detail::make_zeroed_device_uvector_async(size, stream)); + cudf::detail::make_zeroed_device_uvector_async( + size, stream, rmm::mr::get_current_device_resource())); break; // list columns store a buffer of int32's as offsets to represent diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index a03789464cc..5c5cbd1c01d 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -126,7 +126,8 @@ cudf::size_type find_all_from_set(device_span data, cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions)); const int grid_size = divCeil(data.size(), (size_t)block_size); - auto d_count = cudf::detail::make_zeroed_device_uvector_async(1, stream); + auto d_count = cudf::detail::make_zeroed_device_uvector_async( + 1, stream, rmm::mr::get_current_device_resource()); for (char key : keys) { count_and_set_positions<<>>( data.data(), data.size(), result_offset, key, d_count.data(), positions); @@ -143,7 +144,8 @@ cudf::size_type find_all_from_set(host_span data, rmm::cuda_stream_view stream) { rmm::device_buffer d_chunk(std::min(max_chunk_bytes, data.size()), stream); - auto d_count = cudf::detail::make_zeroed_device_uvector_async(1, stream); + auto d_count = cudf::detail::make_zeroed_device_uvector_async( + 1, stream, rmm::mr::get_current_device_resource()); int block_size = 0; // suggested thread count to use int min_grid_size = 0; // minimum block count required diff --git a/cpp/src/io/utilities/trie.cu b/cpp/src/io/utilities/trie.cu index bf03d6a6a89..e2ace7258f7 100644 --- a/cpp/src/io/utilities/trie.cu +++ b/cpp/src/io/utilities/trie.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -101,7 +101,8 @@ rmm::device_uvector create_serialized_trie(const std::vector rep_level(max_vals_size, stream); rmm::device_uvector def_level(max_vals_size, stream); diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index ec0cc5af44d..83ee6793efb 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -187,10 +187,12 @@ index_vector generate_merged_indices(table_view const& left_table, auto lhs_device_view = table_device_view::create(left_table, stream); auto rhs_device_view = table_device_view::create(right_table, stream); - auto d_column_order = cudf::detail::make_device_uvector_async(column_order, stream); + auto d_column_order = cudf::detail::make_device_uvector_async( + column_order, stream, rmm::mr::get_current_device_resource()); if (nullable) { - auto d_null_precedence = cudf::detail::make_device_uvector_async(null_precedence, stream); + auto d_null_precedence = cudf::detail::make_device_uvector_async( + null_precedence, stream, rmm::mr::get_current_device_resource()); auto ineq_op = detail::row_lexicographic_tagged_comparator( *lhs_device_view, *rhs_device_view, d_column_order.data(), d_null_precedence.data()); diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 54dffc85aca..13f46195392 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -493,11 +493,11 @@ std::pair, std::vector> hash_partition_table( rmm::device_uvector(grid_size * num_partitions, stream); // Holds the total number of rows in each partition - auto global_partition_sizes = - cudf::detail::make_zeroed_device_uvector_async(num_partitions, stream); + auto global_partition_sizes = cudf::detail::make_zeroed_device_uvector_async( + num_partitions, stream, rmm::mr::get_current_device_resource()); - auto row_partition_offset = - cudf::detail::make_zeroed_device_uvector_async(num_rows, stream); + auto row_partition_offset = cudf::detail::make_zeroed_device_uvector_async( + num_rows, 
stream, rmm::mr::get_current_device_resource()); auto const row_hasher = experimental::row::hash::row_hasher(table_to_hash, stream); auto const hasher = diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 785aa839956..4a9c2e3a902 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -85,7 +85,8 @@ struct quantile_functor { auto d_input = column_device_view::create(input, stream); auto d_output = mutable_column_device_view::create(output->mutable_view(), stream); - auto q_device = cudf::detail::make_device_uvector_sync(q, stream); + auto q_device = + cudf::detail::make_device_uvector_sync(q, stream, rmm::mr::get_current_device_resource()); if (!cudf::is_dictionary(input.type())) { auto sorted_data = diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index e71508bab09..c6760e77403 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,7 +49,8 @@ std::unique_ptr
quantiles(table_view const& input, return detail::select_quantile(selector, size, q, interp); }; - auto const q_device = cudf::detail::make_device_uvector_async(q, stream); + auto const q_device = + cudf::detail::make_device_uvector_async(q, stream, rmm::mr::get_current_device_resource()); auto quantile_idx_iter = thrust::make_transform_iterator(q_device.begin(), quantile_idx_lookup); diff --git a/cpp/src/reductions/struct_minmax_util.cuh b/cpp/src/reductions/struct_minmax_util.cuh index 796d10a3477..b2106066ff2 100644 --- a/cpp/src/reductions/struct_minmax_util.cuh +++ b/cpp/src/reductions/struct_minmax_util.cuh @@ -118,7 +118,8 @@ class comparison_binop_generator { // level structs column (which is stored at the first position in the null_orders array) to // achieve this purpose. if (input.has_nulls()) { null_orders.front() = cudf::null_order::AFTER; } - null_orders_dvec = cudf::detail::make_device_uvector_async(null_orders, stream); + null_orders_dvec = cudf::detail::make_device_uvector_async( + null_orders, stream, rmm::mr::get_current_device_resource()); } // else: Don't need to generate nulls order to copy to device memory if we have all null orders // are BEFORE (that happens when we have is_min_op == false). diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 2b4b6373c35..b208e7cd980 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -467,8 +467,10 @@ get_null_bounds_for_orderby_column(column_view const& orderby_column, cudf::device_span(group_offsets.data(), num_groups); // When there are no nulls, just copy the input group offsets to the output. 
- return std::make_tuple(cudf::detail::make_device_uvector_async(group_offsets_span, stream), - cudf::detail::make_device_uvector_async(group_offsets_span, stream)); + return std::make_tuple(cudf::detail::make_device_uvector_async( + group_offsets_span, stream, rmm::mr::get_current_device_resource()), + cudf::detail::make_device_uvector_async( + group_offsets_span, stream, rmm::mr::get_current_device_resource())); } } diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 177fcab03f9..8d273eff4bb 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -160,7 +160,8 @@ struct format_compiler { } // copy format_items to device memory - d_items = cudf::detail::make_device_uvector_async(items, stream); + d_items = cudf::detail::make_device_uvector_async( + items, stream, rmm::mr::get_current_device_resource()); } device_span format_items() { return device_span(d_items); } diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index e3ee59c631f..92b71d128e1 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -85,7 +85,8 @@ auto create_strings_device_views(host_span views, rmm::cuda_s return static_cast(col.size()); }); thrust::inclusive_scan(thrust::host, offset_it, input_offsets.end(), offset_it); - auto d_input_offsets = cudf::detail::make_device_uvector_async(input_offsets, stream); + auto d_input_offsets = cudf::detail::make_device_uvector_async( + input_offsets, stream, rmm::mr::get_current_device_resource()); auto const output_size = input_offsets.back(); // Compute the partition offsets and size of chars column diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 8a6a4d44b1e..3e38b5fa775 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -132,7 +132,8 @@ std::unique_ptr filter_characters( characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) { return char_range{entry.first, entry.second}; }); - rmm::device_uvector table = cudf::detail::make_device_uvector_async(htable, stream); + rmm::device_uvector table = + cudf::detail::make_device_uvector_async(htable, stream, rmm::mr::get_current_device_resource()); auto d_strings = column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index c6ea47ec0f3..128d450cbe8 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -673,11 +673,10 @@ std::pair>, int> build_comma } while (op.type != path_operator_type::END); auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END; - return is_empty - ? std::pair(thrust::nullopt, 0) - : std::pair( - thrust::make_optional(cudf::detail::make_device_uvector_sync(h_operators, stream)), - max_stack_depth); + return is_empty ? std::pair(thrust::nullopt, 0) + : std::pair(thrust::make_optional(cudf::detail::make_device_uvector_sync( + h_operators, stream, rmm::mr::get_current_device_resource())), + max_stack_depth); } #define PARSE_TRY(_x) \ diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 383337c9088..d25af8c8931 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -118,9 +118,9 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, // parse the repl string for back-ref indicators auto group_count = std::min(99, d_prog->group_counts()); // group count should NOT exceed 99 - auto const parse_result = parse_backrefs(replacement, group_count); - rmm::device_uvector backrefs = - cudf::detail::make_device_uvector_async(parse_result.second, stream); + auto const parse_result = parse_backrefs(replacement, group_count); + rmm::device_uvector backrefs = cudf::detail::make_device_uvector_async( + parse_result.second, stream, rmm::mr::get_current_device_resource()); string_scalar repl_scalar(parse_result.first, true, stream); string_view const d_repl_template = repl_scalar.value(); diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index f3bc7fc82ec..50b2dc27671 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -169,7 +169,8 @@ std::unique_ptr replace_re(strings_column_view const& input, prog->set_working_memory(d_buffer, size); return *prog; }); - auto d_progs = cudf::detail::make_device_uvector_async(progs, stream); + auto d_progs = + cudf::detail::make_device_uvector_async(progs, stream, rmm::mr::get_current_device_resource()); auto const d_strings = column_device_view::create(input.parent(), stream); auto const d_repls = column_device_view::create(replacements.parent(), stream); diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 7f134059ded..e7b637c52f3 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -106,7 +106,7 @@ std::unique_ptr translate(strings_column_view const& strings, }); // copy translate table to device memory rmm::device_uvector table = - cudf::detail::make_device_uvector_async(htable, stream); + cudf::detail::make_device_uvector_async(htable, stream, rmm::mr::get_current_device_resource()); auto d_strings = column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 8a63a6f6411..ae49ad17e53 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -268,7 +268,8 @@ auto list_lex_preprocess(table_view table, rmm::cuda_stream_view stream) dremel_device_views.push_back(dremel_data.back()); } } - auto d_dremel_device_views = detail::make_device_uvector_sync(dremel_device_views, stream); + auto d_dremel_device_views = detail::make_device_uvector_sync( + dremel_device_views, stream, rmm::mr::get_current_device_resource()); return std::make_tuple(std::move(dremel_data), std::move(d_dremel_device_views)); } @@ -355,10 +356,13 @@ std::shared_ptr preprocessed_table::create( auto [verticalized_lhs, new_column_order, new_null_precedence, verticalized_col_depths] = decompose_structs(t, column_order, null_precedence); - auto d_t = table_device_view::create(verticalized_lhs, stream); - auto d_column_order = detail::make_device_uvector_async(new_column_order, stream); - auto d_null_precedence = detail::make_device_uvector_async(new_null_precedence, stream); - auto d_depths = detail::make_device_uvector_async(verticalized_col_depths, stream); + auto d_t = table_device_view::create(verticalized_lhs, stream); + auto d_column_order = detail::make_device_uvector_async( + new_column_order, stream, rmm::mr::get_current_device_resource()); + auto d_null_precedence = detail::make_device_uvector_async( + new_null_precedence, stream, 
rmm::mr::get_current_device_resource()); + auto d_depths = detail::make_device_uvector_async( + verticalized_col_depths, stream, rmm::mr::get_current_device_resource()); if (detail::has_nested_columns(t)) { auto [dremel_data, d_dremel_device_view] = list_lex_preprocess(verticalized_lhs, stream); diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 634fdd70831..b982a010e6e 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -500,7 +500,8 @@ std::unique_ptr row_bit_count(table_view const& t, auto d_cols = contiguous_copy_column_device_views(cols, stream); // move stack info to the gpu - rmm::device_uvector d_info = cudf::detail::make_device_uvector_async(info, stream); + rmm::device_uvector d_info = + cudf::detail::make_device_uvector_async(info, stream, rmm::mr::get_current_device_resource()); // each thread needs to maintain a stack of row spans of size max_branch_depth. 
we will use // shared memory to do this rather than allocating a potentially gigantic temporary buffer diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index 00ec7bd218b..7805828ad55 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -87,7 +87,7 @@ rmm::device_uvector make_mask(cudf::size_type size, bool fil { if (!fill_valid) { return cudf::detail::make_zeroed_device_uvector_sync( - size, cudf::get_default_stream()); + size, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); } else { auto ret = rmm::device_uvector(size, cudf::get_default_stream()); CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index 43874b84114..5694513647b 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -141,9 +141,10 @@ struct AtomicsTest : public cudf::test::BaseFixture { result_init[4] = result_init[1]; result_init[5] = result_init[2]; - auto dev_data = cudf::detail::make_device_uvector_sync(v, cudf::get_default_stream()); - auto dev_result = - cudf::detail::make_device_uvector_sync(result_init, cudf::get_default_stream()); + auto dev_data = cudf::detail::make_device_uvector_sync( + v, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto dev_result = cudf::detail::make_device_uvector_sync( + result_init, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); if (block_size == 0) { block_size = vec_size; } @@ -294,8 +295,10 @@ struct AtomicsBitwiseOpTest : public cudf::test::BaseFixture { exact[2] = std::accumulate( v.begin(), v.end(), identity[2], [](T acc, uint64_t i) { return acc ^ T(i); }); - auto dev_result = cudf::detail::make_device_uvector_sync(identity, cudf::get_default_stream()); - auto dev_data = cudf::detail::make_device_uvector_sync(v, cudf::get_default_stream()); + auto dev_result = cudf::detail::make_device_uvector_sync( + identity, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto dev_data = cudf::detail::make_device_uvector_sync( + v, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); if (block_size == 0) { block_size = vec_size; } diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu index ab9970dc370..9631e433a5e 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cu +++ b/cpp/tests/fixed_point/fixed_point_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -83,7 +83,8 @@ TEST_F(FixedPointTest, DecimalXXThrustOnDevice) using decimal32 = fixed_point; std::vector vec1(1000, decimal32{1, scale_type{-2}}); - auto d_vec1 = cudf::detail::make_device_uvector_sync(vec1, cudf::get_default_stream()); + auto d_vec1 = cudf::detail::make_device_uvector_sync( + vec1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const sum = thrust::reduce(rmm::exec_policy(cudf::get_default_stream()), std::cbegin(d_vec1), @@ -96,7 +97,8 @@ TEST_F(FixedPointTest, DecimalXXThrustOnDevice) // change inclusive scan to run on device (avoid copying to host) thrust::inclusive_scan(std::cbegin(vec1), std::cend(vec1), std::begin(vec1)); - d_vec1 = cudf::detail::make_device_uvector_sync(vec1, cudf::get_default_stream()); + d_vec1 = cudf::detail::make_device_uvector_sync( + vec1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); std::vector vec2(1000); std::iota(std::begin(vec2), std::end(vec2), 1); diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 3c01bd4de25..8af530b0002 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -470,7 +470,9 @@ TEST_P(JsonParserTest, ExtractColumn) std::string const input = R"( [{"a":0.0, "b":1.0}, {"a":0.1, "b":1.1}, {"a":0.2, "b":1.2}] )"; auto const d_input = cudf::detail::make_device_uvector_async( - cudf::host_span{input.c_str(), input.size()}, stream); + cudf::host_span{input.c_str(), input.size()}, + stream, + rmm::mr::get_current_device_resource()); // Get the JSON's tree representation auto const cudf_table = json_parser(d_input, default_options, stream, mr); @@ -508,7 +510,9 @@ TEST_P(JsonParserTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}])"; auto const d_ascii_pass = cudf::detail::make_device_uvector_sync( - cudf::host_span{ascii_pass.c_str(), ascii_pass.size()}, stream); + cudf::host_span{ascii_pass.c_str(), 
ascii_pass.size()}, + stream, + rmm::mr::get_current_device_resource()); CUDF_EXPECT_NO_THROW(json_parser(d_ascii_pass, default_options, stream, mr)); @@ -521,7 +525,9 @@ TEST_P(JsonParserTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "filip Ê’akotÉ›"}}])"; auto const d_utf_failed = cudf::detail::make_device_uvector_sync( - cudf::host_span{utf_failed.c_str(), utf_failed.size()}, stream); + cudf::host_span{utf_failed.c_str(), utf_failed.size()}, + stream, + rmm::mr::get_current_device_resource()); CUDF_EXPECT_NO_THROW(json_parser(d_utf_failed, default_options, stream, mr)); // utf-8 string that passes parsing. @@ -534,7 +540,9 @@ TEST_P(JsonParserTest, UTF_JSON) {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}, {"a":1,"b":NaN,"c":[null, null], "d": {"year": 2, "author": "filip Ê’akotÉ›"}}])"; auto const d_utf_pass = cudf::detail::make_device_uvector_sync( - cudf::host_span{utf_pass.c_str(), utf_pass.size()}, stream); + cudf::host_span{utf_pass.c_str(), utf_pass.size()}, + stream, + rmm::mr::get_current_device_resource()); CUDF_EXPECT_NO_THROW(json_parser(d_utf_pass, default_options, stream, mr)); } @@ -555,7 +563,9 @@ TEST_P(JsonParserTest, ExtractColumnWithQuotes) std::string const input = R"( [{"a":"0.0", "b":1.0}, {"b":1.1}, {"b":2.1, "a":"2.0"}] )"; auto const d_input = cudf::detail::make_device_uvector_async( - cudf::host_span{input.c_str(), input.size()}, stream); + cudf::host_span{input.c_str(), input.size()}, + stream, + rmm::mr::get_current_device_resource()); // Get the JSON's tree representation auto const cudf_table = json_parser(d_input, options, stream, mr); @@ -599,14 +609,18 @@ TEST_P(JsonParserTest, ExpectFailMixStructAndList) // libcudf does not currently support a mix of lists and structs. 
for (auto const& input : inputs_fail) { auto const d_input = cudf::detail::make_device_uvector_async( - cudf::host_span{input.c_str(), input.size()}, stream); + cudf::host_span{input.c_str(), input.size()}, + stream, + rmm::mr::get_current_device_resource()); EXPECT_THROW(auto const cudf_table = json_parser(d_input, options, stream, mr), cudf::logic_error); } for (auto const& input : inputs_succeed) { auto const d_input = cudf::detail::make_device_uvector_async( - cudf::host_span{input.c_str(), input.size()}, stream); + cudf::host_span{input.c_str(), input.size()}, + stream, + rmm::mr::get_current_device_resource()); CUDF_EXPECT_NO_THROW(auto const cudf_table = json_parser(d_input, options, stream, mr)); } } @@ -626,8 +640,10 @@ TEST_P(JsonParserTest, EmptyString) cudf::io::json_reader_options default_options{}; std::string const input = R"([])"; - auto const d_input = cudf::detail::make_device_uvector_sync( - cudf::host_span{input.c_str(), input.size()}, stream); + auto const d_input = + cudf::detail::make_device_uvector_sync(cudf::host_span{input.c_str(), input.size()}, + stream, + rmm::mr::get_current_device_resource()); // Get the JSON's tree representation auto const cudf_table = json_parser(d_input, default_options, stream, mr); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index e82b0c670b8..8a16fd9a05a 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -1128,7 +1128,8 @@ TEST_F(ParquetWriterTest, BufferSource) auto const d_input = cudf::detail::make_device_uvector_sync( cudf::host_span{reinterpret_cast(out_buffer.data()), out_buffer.size()}, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto const d_buffer = cudf::device_span( reinterpret_cast(d_input.data()), d_input.size()); cudf::io::parquet_reader_options in_opts = diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index ea6eb9b93ef..81c6563cd2d 100644 
--- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,12 +51,12 @@ TEST_F(TypeInference, Basic) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 4, 7}; - auto const string_length = std::vector{2, 2, 1}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 4, 7}; + auto const string_length = std::vector{2, 2, 1}; + auto const d_string_offset = cudf::detail::make_device_uvector_async( + string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); @@ -84,12 +84,12 @@ TEST_F(TypeInference, Null) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 4}; - auto const string_length = std::vector{0, 2, 1}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 1, 4}; + auto const string_length = std::vector{0, 2, 1}; + auto const d_string_offset = cudf::detail::make_device_uvector_async( + 
string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); @@ -117,12 +117,12 @@ TEST_F(TypeInference, AllNull) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 1}; - auto const string_length = std::vector{0, 0, 4}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 1, 1}; + auto const string_length = std::vector{0, 0, 4}; + auto const d_string_offset = cudf::detail::make_device_uvector_async( + string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); @@ -150,12 +150,12 @@ TEST_F(TypeInference, String) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 8, 12}; - auto const string_length = std::vector{6, 3, 4}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 8, 12}; + auto const string_length = std::vector{6, 3, 4}; + auto const d_string_offset = 
cudf::detail::make_device_uvector_async( + string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); @@ -183,12 +183,12 @@ TEST_F(TypeInference, Bool) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 6, 12}; - auto const string_length = std::vector{4, 5, 5}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 6, 12}; + auto const string_length = std::vector{4, 5, 5}; + auto const d_string_offset = cudf::detail::make_device_uvector_async( + string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); @@ -216,12 +216,12 @@ TEST_F(TypeInference, Timestamp) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 10}; - auto const string_length = std::vector{8, 9}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 10}; + auto const string_length = std::vector{8, 9}; + 
auto const d_string_offset = cudf::detail::make_device_uvector_async( + string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); @@ -250,12 +250,12 @@ TEST_F(TypeInference, InvalidInput) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 3, 5, 7, 9}; - auto const string_length = std::vector{1, 1, 1, 1, 1}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 3, 5, 7, 9}; + auto const string_length = std::vector{1, 1, 1, 1, 1}; + auto const d_string_offset = cudf::detail::make_device_uvector_async( + string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh index 894e117ba40..882de994e67 100644 --- a/cpp/tests/iterator/iterator_tests.cuh +++ b/cpp/tests/iterator/iterator_tests.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -87,8 +87,8 @@ struct IteratorTest : public cudf::test::BaseFixture { { InputIterator d_in_last = d_in + num_items; EXPECT_EQ(thrust::distance(d_in, d_in_last), num_items); - auto dev_expected = - cudf::detail::make_device_uvector_sync(expected, cudf::get_default_stream()); + auto dev_expected = cudf::detail::make_device_uvector_sync( + expected, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); // using a temporary vector and calling transform and all_of separately is // equivalent to thrust::equal but compiles ~3x faster diff --git a/cpp/tests/iterator/value_iterator_test.cuh b/cpp/tests/iterator/value_iterator_test.cuh index fa931d34a0e..8252ce88f39 100644 --- a/cpp/tests/iterator/value_iterator_test.cuh +++ b/cpp/tests/iterator/value_iterator_test.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,8 @@ template void non_null_iterator(IteratorTest& testFixture) { auto host_array = cudf::test::make_type_param_vector({0, 6, 0, -14, 13, 64, -13, -20, 45}); - auto dev_array = cudf::detail::make_device_uvector_sync(host_array, cudf::get_default_stream()); + auto dev_array = cudf::detail::make_device_uvector_sync( + host_array, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); // calculate the expected value by CPU. thrust::host_vector replaced_array(host_array); diff --git a/cpp/tests/iterator/value_iterator_test_strings.cu b/cpp/tests/iterator/value_iterator_test_strings.cu index 8b4080fa493..d0e62c09a03 100644 --- a/cpp/tests/iterator/value_iterator_test_strings.cu +++ b/cpp/tests/iterator/value_iterator_test_strings.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,7 +30,8 @@ auto strings_to_string_views(std::vector& input_strings) std::vector offsets; std::tie(chars, offsets) = cudf::test::detail::make_chars_and_offsets( input_strings.begin(), input_strings.end(), all_valid); - auto dev_chars = cudf::detail::make_device_uvector_sync(chars, cudf::get_default_stream()); + auto dev_chars = cudf::detail::make_device_uvector_sync( + chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); // calculate the expected value by CPU. (but contains device pointers) thrust::host_vector replaced_array(input_strings.size()); @@ -51,8 +52,9 @@ TEST_F(StringIteratorTest, string_view_null_iterator) using T = cudf::string_view; std::string zero("zero"); // the char data has to be in GPU - auto initmsg = cudf::detail::make_device_uvector_sync(zero, cudf::get_default_stream()); - T init = T{initmsg.data(), int(initmsg.size())}; + auto initmsg = cudf::detail::make_device_uvector_sync( + zero, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + T init = T{initmsg.data(), int(initmsg.size())}; // data and valid arrays std::vector host_values( @@ -86,8 +88,9 @@ TEST_F(StringIteratorTest, string_view_no_null_iterator) // T init = T{"", 0}; std::string zero("zero"); // the char data has to be in GPU - auto initmsg = cudf::detail::make_device_uvector_sync(zero, cudf::get_default_stream()); - T init = T{initmsg.data(), int(initmsg.size())}; + auto initmsg = cudf::detail::make_device_uvector_sync( + zero, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + T init = T{initmsg.data(), int(initmsg.size())}; // data array std::vector host_values( @@ -110,8 +113,9 @@ TEST_F(StringIteratorTest, string_scalar_iterator) // T init = T{"", 0}; std::string zero("zero"); // the char data has to be in GPU - auto initmsg = cudf::detail::make_device_uvector_sync(zero, 
cudf::get_default_stream()); - T init = T{initmsg.data(), int(initmsg.size())}; + auto initmsg = cudf::detail::make_device_uvector_sync( + zero, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + T init = T{initmsg.data(), int(initmsg.size())}; // data array std::vector host_values(100, zero); diff --git a/cpp/tests/partitioning/hash_partition_test.cpp b/cpp/tests/partitioning/hash_partition_test.cpp index 9d206c5397d..a1508b5b973 100644 --- a/cpp/tests/partitioning/hash_partition_test.cpp +++ b/cpp/tests/partitioning/hash_partition_test.cpp @@ -308,8 +308,8 @@ void run_fixed_width_test(size_t cols, // Make a table view of the partition numbers constexpr cudf::data_type dtype{cudf::type_id::INT32}; - auto d_partitions = - cudf::detail::make_device_uvector_sync(partitions, cudf::get_default_stream()); + auto d_partitions = cudf::detail::make_device_uvector_sync( + partitions, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); cudf::column_view partitions_col(dtype, rows, d_partitions.data()); cudf::table_view partitions_table({partitions_col}); diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index 47bcbb874cf..40b0d268580 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -49,9 +49,9 @@ TYPED_TEST(SegmentedReductionTest, SumExcludeNulls) // output nullmask: {1, 1, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = 
cudf::test::fixed_width_column_wrapper{{6, 4, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}}; @@ -97,9 +97,9 @@ TYPED_TEST(SegmentedReductionTest, ProductExcludeNulls) // output nullmask: {1, 1, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 1, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{15, 15, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}}; @@ -147,9 +147,9 @@ TYPED_TEST(SegmentedReductionTest, MaxExcludeNulls) // output nullmask: {1, 1, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{3, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}}; @@ -195,9 +195,9 @@ TYPED_TEST(SegmentedReductionTest, MinExcludeNulls) // output nullmask: {1, 1, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto 
const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{1, 1, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}}; @@ -244,9 +244,9 @@ TYPED_TEST(SegmentedReductionTest, AnyExcludeNulls) auto const input = cudf::test::fixed_width_column_wrapper{ {0, 0, 0, 0, XXX, 0, 0, 1, 0, 1, XXX, 0, 0, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 9, 12, 12, 13, 14, 15, 17}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 9, 12, 12, 13, 14, 15, 17}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{ {false, false, true, true, bool{XXX}, false, true, bool{XXX}, bool{XXX}}, {true, true, true, true, false, true, true, false, false}}; @@ -284,9 +284,9 @@ TYPED_TEST(SegmentedReductionTest, AllExcludeNulls) auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX, 1, 0, 3, 1, XXX, 0, 0}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}}; - auto const offsets = std::vector{0, 3, 6, 6, 7, 8, 10, 13, 16, 17}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 6, 7, 8, 10, 13, 16, 17}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{ {true, true, bool{XXX}, true, bool{XXX}, bool{XXX}, false, false, false}, {true, true, false, true, false, false, true, true, true}}; @@ -335,9 +335,9 @@ TYPED_TEST(SegmentedReductionTest, SumIncludeNulls) // output 
nullmask: {1, 0, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{6, XXX, 1, XXX, XXX, XXX}, {1, 0, 1, 0, 0, 0}}; @@ -386,9 +386,9 @@ TYPED_TEST(SegmentedReductionTest, ProductIncludeNulls) // output nullmask: {1, 0, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 1, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{15, XXX, 1, XXX, XXX, XXX}, {1, 0, 1, 0, 0, 0}}; @@ -439,9 +439,9 @@ TYPED_TEST(SegmentedReductionTest, MaxIncludeNulls) // output nullmask: {1, 0, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = 
cudf::test::fixed_width_column_wrapper{{3, XXX, 1, XXX, XXX, XXX}, {1, 0, 1, 0, 0, 0}}; @@ -490,9 +490,9 @@ TYPED_TEST(SegmentedReductionTest, MinIncludeNulls) // output nullmask: {1, 0, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{1, XXX, 1, XXX, XXX, XXX}, {1, 0, 1, 0, 0, 0}}; @@ -542,9 +542,9 @@ TYPED_TEST(SegmentedReductionTest, AnyIncludeNulls) auto const input = cudf::test::fixed_width_column_wrapper{ {0, 0, 0, 0, XXX, 0, 0, 1, 0, 1, XXX, 0, 0, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 9, 12, 12, 13, 14, 15, 17}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 9, 12, 12, 13, 14, 15, 17}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{ {false, bool{XXX}, true, bool{XXX}, bool{XXX}, false, true, bool{XXX}, bool{XXX}}, {true, false, true, false, false, true, true, false, false}}; @@ -605,9 +605,9 @@ TYPED_TEST(SegmentedReductionTest, AllIncludeNulls) auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX, 1, 0, 3, 1, XXX, 0, 0}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}}; - auto const offsets = std::vector{0, 3, 6, 6, 7, 8, 10, 13, 16, 17}; - auto const d_offsets = - 
cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 6, 7, 8, 10, 13, 16, 17}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{ {true, bool{XXX}, bool{XXX}, true, bool{XXX}, bool{XXX}, false, bool{XXX}, false}, {true, false, false, true, false, false, true, false, true}}; @@ -670,9 +670,9 @@ TEST_F(SegmentedReductionTestUntyped, PartialSegmentReduction) auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 4, 5, 6, 7}, {true, true, true, true, true, true, true}}; - auto const offsets = std::vector{1, 3, 4}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{1, 3, 4}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{5, 4}, {true, true}}; auto res = @@ -720,10 +720,10 @@ TEST_F(SegmentedReductionTestUntyped, NonNullableInput) // outputs: {1, 5, 4} // output nullmask: {1, 1, 1} - auto const input = cudf::test::fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7}; - auto const offsets = std::vector{0, 1, 1, 3, 7}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const input = cudf::test::fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7}; + auto const offsets = std::vector{0, 1, 1, 3, 7}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{1, XXX, 5, 22}, {true, false, true, true}}; @@ -767,9 +767,9 @@ TEST_F(SegmentedReductionTestUntyped, Mean) { auto const input = 
cudf::test::fixed_width_column_wrapper{10, 20, 30, 40, 50, 60, 70, 80, 90}; - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_mean_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::FLOAT32}; @@ -786,9 +786,9 @@ TEST_F(SegmentedReductionTestUntyped, MeanNulls) { auto const input = cudf::test::fixed_width_column_wrapper( {10, 20, 30, 40, 50, 60, 0, 80, 90}, {1, 1, 1, 1, 1, 1, 0, 1, 1}); - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_mean_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::FLOAT64}; @@ -806,9 +806,9 @@ TEST_F(SegmentedReductionTestUntyped, SumOfSquares) { auto const input = cudf::test::fixed_width_column_wrapper{10, 20, 30, 40, 50, 60, 70, 80, 90}; - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_sum_of_squares_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::INT32}; @@ -826,9 +826,9 @@ TEST_F(SegmentedReductionTestUntyped, SumOfSquaresNulls) { auto const input = cudf::test::fixed_width_column_wrapper( {10, 20, 
30, 40, 50, 60, 0, 80, 90}, {1, 1, 1, 1, 1, 1, 0, 1, 1}); - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_sum_of_squares_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::INT64}; @@ -848,9 +848,9 @@ TEST_F(SegmentedReductionTestUntyped, StandardDeviation) constexpr float NaN{std::numeric_limits::quiet_NaN()}; auto const input = cudf::test::fixed_width_column_wrapper{10, 20, 30, 40, 50, 60, 70, 80, 90}; - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_std_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::FLOAT32}; @@ -868,9 +868,9 @@ TEST_F(SegmentedReductionTestUntyped, StandardDeviationNulls) constexpr double NaN{std::numeric_limits::quiet_NaN()}; auto const input = cudf::test::fixed_width_column_wrapper( {10, 0, 20, 30, 54, 63, 0, 72, 81}, {1, 0, 1, 1, 1, 1, 0, 1, 1}); - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_std_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::FLOAT64}; @@ -890,9 +890,9 @@ 
TEST_F(SegmentedReductionTestUntyped, Variance) constexpr float NaN{std::numeric_limits::quiet_NaN()}; auto const input = cudf::test::fixed_width_column_wrapper{10, 20, 30, 40, 50, 60, 70, 80, 90}; - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_variance_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::FLOAT32}; @@ -910,9 +910,9 @@ TEST_F(SegmentedReductionTestUntyped, VarianceNulls) constexpr double NaN{std::numeric_limits::quiet_NaN()}; auto const input = cudf::test::fixed_width_column_wrapper( {10, 0, 20, 30, 54, 63, 0, 72, 81}, {1, 0, 1, 1, 1, 1, 0, 1, 1}); - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_variance_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::FLOAT64}; @@ -931,9 +931,9 @@ TEST_F(SegmentedReductionTestUntyped, Errors) { auto const input = cudf::test::fixed_width_column_wrapper( {10, 0, 20, 30, 54, 63, 0, 72, 81}, {1, 0, 1, 1, 1, 1, 0, 1, 1}); - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const null_policy = cudf::null_policy::EXCLUDE; auto 
const output_type = cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}; auto const str_input = @@ -999,10 +999,10 @@ TEST_F(SegmentedReductionTestUntyped, Errors) TEST_F(SegmentedReductionTestUntyped, ReduceEmptyColumn) { - auto const input = cudf::test::fixed_width_column_wrapper{}; - auto const offsets = std::vector{0}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const input = cudf::test::fixed_width_column_wrapper{}; + auto const offsets = std::vector{0}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{}; auto res = @@ -1036,10 +1036,10 @@ TEST_F(SegmentedReductionTestUntyped, ReduceEmptyColumn) TEST_F(SegmentedReductionTestUntyped, EmptyInputWithOffsets) { - auto const input = cudf::test::fixed_width_column_wrapper{}; - auto const offsets = std::vector{0, 0, 0, 0, 0, 0}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const input = cudf::test::fixed_width_column_wrapper{}; + auto const offsets = std::vector{0, 0, 0, 0, 0, 0}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{XXX, XXX, XXX, XXX, XXX}, {0, 0, 0, 0, 0}}; @@ -1087,9 +1087,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, MaxWithNulls) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = 
cudf::make_max_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1115,9 +1115,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, MinWithNulls) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_min_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1143,9 +1143,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, MaxNonNullableInput) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 4, 4}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 4, 4}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_max_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1168,9 +1168,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, MinNonNullableInput) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 4, 4}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 4, 4}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_min_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1193,9 +1193,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, Sum) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - 
cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_sum_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1231,9 +1231,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, Product) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 6, 7, 8, 12, 12}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 12, 12}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_product_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1268,9 +1268,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, SumOfSquares) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_sum_of_squares_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1431,10 +1431,10 @@ TEST_F(SegmentedReductionStringTest, MinExcludeNulls) TEST_F(SegmentedReductionStringTest, EmptyInputWithOffsets) { - auto const input = cudf::test::strings_column_wrapper{}; - auto const offsets = std::vector{0, 0, 0, 0}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const input = cudf::test::strings_column_wrapper{}; + auto const offsets = std::vector{0, 0, 0, 0}; + auto const 
d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::strings_column_wrapper({XXX, XXX, XXX}, {0, 0, 0}); auto result = diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu index c7365d63e1c..9e0f68573a5 100644 --- a/cpp/tests/scalar/scalar_device_view_test.cu +++ b/cpp/tests/scalar/scalar_device_view_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -130,7 +130,8 @@ TEST_F(StringScalarDeviceViewTest, Value) auto scalar_device_view = cudf::get_scalar_device_view(s); rmm::device_scalar result{cudf::get_default_stream()}; - auto value_v = cudf::detail::make_device_uvector_sync(value, cudf::get_default_stream()); + auto value_v = cudf::detail::make_device_uvector_sync( + value, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); test_string_value<<<1, 1, 0, cudf::get_default_stream().value()>>>( scalar_device_view, value_v.data(), value.size(), result.data()); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 5331c4c34d8..316f24e4167 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -298,9 +298,11 @@ TEST_F(StringsContainsTests, HexTest) std::vector offsets( {thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + count + 1}); - auto d_chars = cudf::detail::make_device_uvector_sync(ascii_chars, cudf::get_default_stream()); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, cudf::get_default_stream()); - auto input = cudf::make_strings_column(d_chars, d_offsets); + auto d_chars = cudf::detail::make_device_uvector_sync( + ascii_chars, cudf::get_default_stream(), 
rmm::mr::get_current_device_resource()); + auto d_offsets = cudf::detail::make_device_uvector_sync( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto input = cudf::make_strings_column(d_chars, d_offsets); auto strings_view = cudf::strings_column_view(input->view()); for (auto ch : ascii_chars) { diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index e3df8db721d..77857049e7a 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -78,7 +78,8 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) } h_offsets[idx + 1] = offset; } - auto d_strings = cudf::detail::make_device_uvector_sync(strings, cudf::get_default_stream()); + auto d_strings = cudf::detail::make_device_uvector_sync( + strings, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); CUDF_CUDA_TRY(cudaMemcpy(d_buffer.data(), h_buffer.data(), memsize, cudaMemcpyDefault)); auto column = cudf::make_strings_column(d_strings); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING}); @@ -143,10 +144,13 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) } std::vector h_nulls{h_null_mask}; - auto d_buffer = cudf::detail::make_device_uvector_sync(h_buffer, cudf::get_default_stream()); - auto d_offsets = cudf::detail::make_device_uvector_sync(h_offsets, cudf::get_default_stream()); - auto d_nulls = cudf::detail::make_device_uvector_sync(h_nulls, cudf::get_default_stream()); - auto column = cudf::make_strings_column(d_buffer, d_offsets, d_nulls, null_count); + auto d_buffer = cudf::detail::make_device_uvector_sync( + h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_offsets = cudf::detail::make_device_uvector_sync( + h_offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_nulls = cudf::detail::make_device_uvector_sync( + h_nulls, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto 
column = cudf::make_strings_column(d_buffer, d_offsets, d_nulls, null_count); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING}); EXPECT_EQ(column->null_count(), null_count); EXPECT_EQ(2, column->num_children()); @@ -184,8 +188,8 @@ TEST_F(StringsFactoriesTest, CreateScalar) TEST_F(StringsFactoriesTest, EmptyStringsColumn) { rmm::device_uvector d_chars{0, cudf::get_default_stream()}; - auto d_offsets = - cudf::detail::make_zeroed_device_uvector_sync(1, cudf::get_default_stream()); + auto d_offsets = cudf::detail::make_zeroed_device_uvector_sync( + 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); rmm::device_uvector d_nulls{0, cudf::get_default_stream()}; auto results = cudf::make_strings_column(d_chars, d_offsets, d_nulls, 0); diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 04e6886a08a..79e96ff5121 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -297,8 +297,9 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) std::iota(h_integers.begin(), h_integers.end(), -(TypeParam)(h_integers.size() / 2)); h_integers.push_back(std::numeric_limits::min()); h_integers.push_back(std::numeric_limits::max()); - auto d_integers = cudf::detail::make_device_uvector_sync(h_integers, cudf::get_default_stream()); - auto integers = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + auto d_integers = cudf::detail::make_device_uvector_sync( + h_integers, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto integers = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, (cudf::size_type)d_integers.size()); auto integers_view = integers->mutable_view(); CUDF_CUDA_TRY(cudaMemcpy(integers_view.data(), diff --git a/cpp/tests/table/table_view_tests.cu b/cpp/tests/table/table_view_tests.cu index 0542d007ca0..5127f69162f 100644 --- a/cpp/tests/table/table_view_tests.cu +++ b/cpp/tests/table/table_view_tests.cu @@ -1,5 
+1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,8 +47,8 @@ void row_comparison(cudf::table_view input1, auto device_table_1 = cudf::table_device_view::create(input1, stream); auto device_table_2 = cudf::table_device_view::create(input2, stream); - auto d_column_order = - cudf::detail::make_device_uvector_sync(column_order, cudf::get_default_stream()); + auto d_column_order = cudf::detail::make_device_uvector_sync( + column_order, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto comparator = cudf::row_lexicographic_comparator( cudf::nullate::NO{}, *device_table_1, *device_table_2, d_column_order.data()); diff --git a/cpp/tests/types/type_dispatcher_test.cu b/cpp/tests/types/type_dispatcher_test.cu index 911911851f2..a27d8931ee6 100644 --- a/cpp/tests/types/type_dispatcher_test.cu +++ b/cpp/tests/types/type_dispatcher_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -69,7 +69,8 @@ __global__ void dispatch_test_kernel(cudf::type_id id, bool* d_result) TYPED_TEST(TypedDispatcherTest, DeviceDispatch) { - auto result = cudf::detail::make_zeroed_device_uvector_sync(1, cudf::get_default_stream()); + auto result = cudf::detail::make_zeroed_device_uvector_sync( + 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); dispatch_test_kernel<<<1, 1, 0, cudf::get_default_stream().value()>>>( cudf::type_to_id(), result.data()); CUDF_CUDA_TRY(cudaDeviceSynchronize()); @@ -130,7 +131,8 @@ __global__ void double_dispatch_test_kernel(cudf::type_id id1, cudf::type_id id2 TYPED_TEST(TypedDoubleDispatcherTest, DeviceDoubleDispatch) { - auto result = cudf::detail::make_zeroed_device_uvector_sync(1, cudf::get_default_stream()); + auto result = cudf::detail::make_zeroed_device_uvector_sync( + 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); double_dispatch_test_kernel<<<1, 1, 0, cudf::get_default_stream().value()>>>( cudf::type_to_id(), cudf::type_to_id(), result.data()); CUDF_CUDA_TRY(cudaDeviceSynchronize()); diff --git a/cpp/tests/utilities/tdigest_utilities.cu b/cpp/tests/utilities/tdigest_utilities.cu index 15998e32bd0..d2e95812894 100644 --- a/cpp/tests/utilities/tdigest_utilities.cu +++ b/cpp/tests/utilities/tdigest_utilities.cu @@ -64,12 +64,12 @@ void tdigest_sample_compare(cudf::tdigest::tdigest_column_view const& tdv, }); } - auto d_expected_src = - cudf::detail::make_device_uvector_async(h_expected_src, cudf::get_default_stream()); - auto d_expected_mean = - cudf::detail::make_device_uvector_async(h_expected_mean, cudf::get_default_stream()); - auto d_expected_weight = - cudf::detail::make_device_uvector_async(h_expected_weight, cudf::get_default_stream()); + auto d_expected_src = cudf::detail::make_device_uvector_async( + h_expected_src, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_expected_mean = cudf::detail::make_device_uvector_async( + h_expected_mean, 
cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_expected_weight = cudf::detail::make_device_uvector_async( + h_expected_weight, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto iter = thrust::make_counting_iterator(0); thrust::for_each( diff --git a/cpp/tests/utilities_tests/span_tests.cu b/cpp/tests/utilities_tests/span_tests.cu index a043e723eda..66f9fbfc0d6 100644 --- a/cpp/tests/utilities_tests/span_tests.cu +++ b/cpp/tests/utilities_tests/span_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -238,8 +238,8 @@ __global__ void simple_device_kernel(device_span result) { result[0] = tru TEST(SpanTest, CanUseDeviceSpan) { - auto d_message = - cudf::detail::make_zeroed_device_uvector_async(1, cudf::get_default_stream()); + auto d_message = cudf::detail::make_zeroed_device_uvector_async( + 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_span = device_span(d_message.data(), d_message.size()); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 5cf7658106f..747ff24f055 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -226,7 +226,8 @@ build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validi std::copy_if(offsets_iter, offsets_iter + tbl.num_columns(), std::back_inserter(offsets_iterators), [](auto const &offset_ptr) { return offset_ptr != nullptr; }); - return make_device_uvector_async(offsets_iterators, stream); + return make_device_uvector_async(offsets_iterators, stream, + rmm::mr::get_current_device_resource()); }(); auto const num_columns = static_cast(d_offsets_iterators.size()); @@ -1539,7 +1540,9 @@ batch_data build_batches(size_type 
num_rows, RowSize row_sizes, bool all_fixed_w last_row_end = row_end; } - return {std::move(batch_row_offsets), make_device_uvector_async(batch_row_boundaries, stream), + return {std::move(batch_row_offsets), + make_device_uvector_async(batch_row_boundaries, stream, + rmm::mr::get_current_device_resource()), std::move(batch_row_boundaries), std::move(row_batches)}; } @@ -1750,8 +1753,10 @@ std::vector> convert_to_rows( return table_view(cols); }; - auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream); - auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream); + auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream, + rmm::mr::get_current_device_resource()); + auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream, + rmm::mr::get_current_device_resource()); // Get the pointers to the input columnar data ready auto const data_begin = thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { @@ -1764,8 +1769,10 @@ std::vector> convert_to_rows( thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { return c.null_mask(); }); std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); - auto dev_input_data = make_device_uvector_async(input_data, stream); - auto dev_input_nm = make_device_uvector_async(input_nm, stream); + auto dev_input_data = + make_device_uvector_async(input_data, stream, rmm::mr::get_current_device_resource()); + auto dev_input_nm = + make_device_uvector_async(input_nm, stream, rmm::mr::get_current_device_resource()); // the first batch always exists unless we were sent an empty table auto const first_batch_size = batch_info.row_batches[0].row_count; @@ -1811,7 +1818,8 @@ std::vector> convert_to_rows( auto validity_tile_infos = detail::build_validity_tile_infos( tbl.num_columns(), num_rows, shmem_limit_per_tile, batch_info.row_batches); - auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream); + 
auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream, + rmm::mr::get_current_device_resource()); auto const validity_offset = column_info.column_starts.back(); @@ -1847,9 +1855,10 @@ std::vector> convert_to_rows( std::vector variable_width_input_data( variable_data_begin, variable_data_begin + variable_width_table.num_columns()); - auto dev_variable_input_data = make_device_uvector_async(variable_width_input_data, stream); - auto dev_variable_col_output_offsets = - make_device_uvector_async(column_info.variable_width_column_starts, stream); + auto dev_variable_input_data = make_device_uvector_async( + variable_width_input_data, stream, rmm::mr::get_current_device_resource()); + auto dev_variable_col_output_offsets = make_device_uvector_async( + column_info.variable_width_column_starts, stream, rmm::mr::get_current_device_resource()); for (uint i = 0; i < batch_info.row_batches.size(); i++) { auto const batch_row_offset = batch_info.batch_row_boundaries[i]; @@ -2076,8 +2085,10 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, // Ideally we would check that the offsets are all the same, etc. but for now this is probably // fine CUDF_EXPECTS(size_per_row * num_rows <= child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream); - auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream); + auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream, + rmm::mr::get_current_device_resource()); + auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream, + rmm::mr::get_current_device_resource()); // Allocate the columns we are going to write into std::vector> output_columns; @@ -2118,16 +2129,20 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, } } - auto dev_string_row_offsets = make_device_uvector_async(string_row_offsets, stream); - auto dev_string_lengths = make_device_uvector_async(string_lengths, stream); + auto dev_string_row_offsets = + make_device_uvector_async(string_row_offsets, stream, rmm::mr::get_current_device_resource()); + auto dev_string_lengths = + make_device_uvector_async(string_lengths, stream, rmm::mr::get_current_device_resource()); // build the row_batches from the passed in list column std::vector row_batches; row_batches.push_back( {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); - auto dev_output_data = make_device_uvector_async(output_data, stream); - auto dev_output_nm = make_device_uvector_async(output_nm, stream); + auto dev_output_data = + make_device_uvector_async(output_data, stream, rmm::mr::get_current_device_resource()); + auto dev_output_nm = + make_device_uvector_async(output_nm, stream, rmm::mr::get_current_device_resource()); // only ever get a single batch when going from rows, so boundaries are 0, num_rows constexpr auto num_batches = 2; @@ -2164,7 +2179,8 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, auto validity_tile_infos = detail::build_validity_tile_infos(schema.size(), num_rows, shmem_limit_per_tile, row_batches); - auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream); + auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream, + rmm::mr::get_current_device_resource()); dim3 const validity_blocks(validity_tile_infos.size()); @@ -2221,8 +2237,10 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, string_col_offsets.push_back(std::move(output_string_offsets)); string_data_cols.push_back(std::move(string_data)); } - auto dev_string_col_offsets = make_device_uvector_async(string_col_offset_ptrs, stream); - auto dev_string_data_cols = make_device_uvector_async(string_data_col_ptrs, stream); + auto dev_string_col_offsets = make_device_uvector_async(string_col_offset_ptrs, stream, + rmm::mr::get_current_device_resource()); + auto dev_string_data_cols = make_device_uvector_async(string_data_col_ptrs, stream, + rmm::mr::get_current_device_resource()); dim3 const string_blocks( std::min(std::max(MIN_STRING_BLOCKS, num_rows / NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS), @@ -2274,8 +2292,10 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( // fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); + auto dev_column_start = + make_device_uvector_async(column_start, stream, rmm::mr::get_current_device_resource()); + auto dev_column_size = + make_device_uvector_async(column_size, stream, rmm::mr::get_current_device_resource()); // Allocate the columns we are going to write into std::vector> output_columns; From d171fdaad641ec186c5cb5c172a96be321b5713b Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Mon, 20 Mar 2023 14:15:21 -0400 Subject: [PATCH 26/63] Update to use rapids-export(COMPONENTS) feature. (#12959) Not only does this simplify the `cudf` CMakeLists.txt it removes a bug where we failed to install the testing dependencies file since it relied on internal rapids-cmake details. Expected to fix the following: - cuspatial issues with not finding GTest targets - [spark-rapids-jni](https://github.com/NVIDIA/spark-rapids-jni) issues with not finding GTest targets Fixes #12976 Authors: - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/12959 --- cpp/CMakeLists.txt | 49 +++++++++++----------------------------------- 1 file changed, 11 insertions(+), 38 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0fcd1895972..0848af2a916 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -890,31 +890,20 @@ install( EXPORT cudf-exports ) -install(DIRECTORY ${CUDF_SOURCE_DIR}/include/cudf ${CUDF_SOURCE_DIR}/include/cudf_test - ${CUDF_SOURCE_DIR}/include/nvtext 
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} -) - -if(CUDF_BUILD_TESTUTIL) +set(_components_export_string) +if(TARGET cudftestutil) install( TARGETS cudftest_default_stream cudftestutil DESTINATION ${lib_dir} EXPORT cudf-testing-exports ) - - install( - EXPORT cudf-testing-exports - FILE cudf-testing-targets.cmake - NAMESPACE cudf:: - DESTINATION "${lib_dir}/cmake/cudf" - ) - - include("${rapids-cmake-dir}/export/write_dependencies.cmake") - rapids_export_write_dependencies( - INSTALL cudf-testing-exports - "${PROJECT_BINARY_DIR}/rapids-cmake/cudf/export/cudf-testing-dependencies.cmake" - ) + set(_components_export_string COMPONENTS testing COMPONENTS_EXPORT_SET cudf-testing-exports) endif() +install(DIRECTORY ${CUDF_SOURCE_DIR}/include/cudf ${CUDF_SOURCE_DIR}/include/cudf_test + ${CUDF_SOURCE_DIR}/include/nvtext DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) + if(CUDF_BUILD_STREAMS_TEST_UTIL) install(TARGETS cudf_identify_stream_usage_mode_cudf DESTINATION ${lib_dir}) install(TARGETS cudf_identify_stream_usage_mode_testing DESTINATION ${lib_dir}) @@ -976,12 +965,6 @@ string( [=[ if(testing IN_LIST cudf_FIND_COMPONENTS) enable_language(CUDA) - if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake") - include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake") - endif() - if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") - include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") - endif() endif() ]=] ) @@ -989,8 +972,8 @@ string(APPEND install_code_string "${common_code_string}") rapids_export( INSTALL cudf - EXPORT_SET cudf-exports - GLOBAL_TARGETS cudf + EXPORT_SET cudf-exports ${_components_export_string} + GLOBAL_TARGETS cudf cudftestutil NAMESPACE cudf:: DOCUMENTATION doc_string FINAL_CODE_BLOCK install_code_string @@ -1013,23 +996,13 @@ string(APPEND build_code_string "${common_code_string}") rapids_export( BUILD cudf - EXPORT_SET cudf-exports - GLOBAL_TARGETS cudf + EXPORT_SET cudf-exports ${_components_export_string} + 
GLOBAL_TARGETS cudf cudftestutil NAMESPACE cudf:: DOCUMENTATION doc_string FINAL_CODE_BLOCK build_code_string ) -if(CUDF_BUILD_TESTUTIL) - export( - EXPORT cudf-testing-exports - FILE ${CUDF_BINARY_DIR}/cudf-testing-targets.cmake - NAMESPACE cudf:: - ) - rapids_export_write_dependencies( - BUILD cudf-testing-exports "${CUDF_BINARY_DIR}/cudf-testing-dependencies.cmake" - ) -endif() # ################################################################################################## # * make documentation ---------------------------------------------------------------------------- From aff1c9f964a9cfa638ce838593a6e6e7d540ef4a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 20 Mar 2023 15:36:48 -0400 Subject: [PATCH 27/63] Add optional text file support to ninja-log utility (#12823) Adds support for `sort_ninja_log.py` utility to accept an optional text file to be included at the top of the generated html report. This allows it to be used asynchronously outside the build step and helps enable using it in C++ builds in non-cudf repos. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/12823 --- build.sh | 7 +- cpp/scripts/sort_ninja_log.py | 233 +++++++++++++++++++++++----------- 2 files changed, 164 insertions(+), 76 deletions(-) diff --git a/build.sh b/build.sh index bee66d819b4..7cbd0fceb5a 100755 --- a/build.sh +++ b/build.sh @@ -300,8 +300,7 @@ if buildAll || hasArg libcudf; then # Record build times if [[ "$BUILD_REPORT_METRICS" == "ON" && -f "${LIB_BUILD_DIR}/.ninja_log" ]]; then echo "Formatting build metrics" - python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt xml > ${LIB_BUILD_DIR}/ninja_log.xml - MSG="

" + MSG="" # get some sccache stats after the compile if [[ "$BUILD_REPORT_INCL_CACHE_STATS" == "ON" && -x "$(command -v sccache)" ]]; then COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }') @@ -318,7 +317,9 @@ if buildAll || hasArg libcudf; then BMR_DIR=${RAPIDS_ARTIFACTS_DIR:-"${LIB_BUILD_DIR}"} echo "Metrics output dir: [$BMR_DIR]" mkdir -p ${BMR_DIR} - python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${BMR_DIR}/ninja_log.html + MSG_OUTFILE="$(mktemp)" + echo "$MSG" > "${MSG_OUTFILE}" + python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "${MSG_OUTFILE}" > ${BMR_DIR}/ninja_log.html cp ${LIB_BUILD_DIR}/.ninja_log ${BMR_DIR}/ninja.log fi diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py index 9cb8afbff9f..3fe503f749e 100755 --- a/cpp/scripts/sort_ninja_log.py +++ b/cpp/scripts/sort_ninja_log.py @@ -1,10 +1,11 @@ # -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
# import argparse import os import sys import xml.etree.ElementTree as ET +from pathlib import Path from xml.dom import minidom parser = argparse.ArgumentParser() @@ -22,52 +23,50 @@ "--msg", type=str, default=None, - help="optional message to include in html output", + help="optional text file to include at the top of the html output", +) +parser.add_argument( + "--cmp_log", + type=str, + default=None, + help="optional baseline ninja_log to compare results", ) args = parser.parse_args() log_file = args.log_file -log_path = os.path.dirname(os.path.abspath(log_file)) - output_fmt = args.fmt +cmp_file = args.cmp_log # build a map of the log entries -entries = {} -with open(log_file) as log: - last = 0 - files = {} - for line in log: - entry = line.split() - if len(entry) > 4: - obj_file = entry[3] - file_size = ( - os.path.getsize(os.path.join(log_path, obj_file)) - if os.path.exists(obj_file) - else 0 - ) - start = int(entry[0]) - end = int(entry[1]) - # logic based on ninjatracing - if end < last: - files = {} - last = end - files.setdefault(entry[4], (entry[3], start, end, file_size)) - - # build entries from files dict - for entry in files.values(): - entries[entry[0]] = (entry[1], entry[2], entry[3]) - -# check file could be loaded and we have entries to report -if len(entries) == 0: - print("Could not parse", log_file) - exit() +def build_log_map(log_file): + entries = {} + log_path = os.path.dirname(os.path.abspath(log_file)) + with open(log_file) as log: + last = 0 + files = {} + for line in log: + entry = line.split() + if len(entry) > 4: + obj_file = entry[3] + file_size = ( + os.path.getsize(os.path.join(log_path, obj_file)) + if os.path.exists(obj_file) + else 0 + ) + start = int(entry[0]) + end = int(entry[1]) + # logic based on ninjatracing + if end < last: + files = {} + last = end + files.setdefault(entry[4], (entry[3], start, end, file_size)) + + # build entries from files dict + for entry in files.values(): + entries[entry[0]] = (entry[1], entry[2], 
entry[3]) + + return entries -# sort the entries by build-time (descending order) -sorted_list = sorted( - list(entries.keys()), - key=lambda k: entries[k][1] - entries[k][0], - reverse=True, -) # output results in XML format def output_xml(entries, sorted_list, args): @@ -147,14 +146,46 @@ def assign_entries_to_threads(entries): return (results, end_time) -# output chart results in HTML format -def output_html(entries, sorted_list, args): +# format the build-time +def format_build_time(input_time): + build_time = abs(input_time) + build_time_str = str(build_time) + " ms" + if build_time > 120000: # 2 minutes + minutes = int(build_time / 60000) + seconds = int(((build_time / 60000) - minutes) * 60) + build_time_str = "{:d}:{:02d} min".format(minutes, seconds) + elif build_time > 1000: + build_time_str = "{:.3f} s".format(build_time / 1000) + if input_time < 0: + build_time_str = "-" + build_time_str + return build_time_str + + +# format file size +def format_file_size(input_size): + file_size = abs(input_size) + file_size_str = "" + if file_size > 1000000: + file_size_str = "{:.3f} MB".format(file_size / 1000000) + elif file_size > 1000: + file_size_str = "{:.3f} KB".format(file_size / 1000) + elif file_size > 0: + file_size_str = str(file_size) + " bytes" + if input_size < 0: + file_size_str = "-" + file_size_str + return file_size_str + + +# Output chart results in HTML format +# Builds a standalone html file with no javascript or styles +def output_html(entries, sorted_list, cmp_entries, args): print("Build Metrics Report") - # Note: Jenkins does not support javascript nor style defined in the html - # https://www.jenkins.io/doc/book/security/configuring-content-security-policy/ print("") if args.msg is not None: - print("

", args.msg, "

") + msg_file = Path(args.msg) + if msg_file.is_file(): + msg = msg_file.read_text() + print("

", msg, "

") # map entries to threads # the end_time is used to scale all the entries to a fixed output width @@ -201,15 +232,8 @@ def output_html(entries, sorted_list, args): # adjust for the cellspacing prev_end = end + int(end_time / 500) - # format the build-time build_time = end - start - build_time_str = str(build_time) + " ms" - if build_time > 120000: # 2 minutes - minutes = int(build_time / 60000) - seconds = int(((build_time / 60000) - minutes) * 60) - build_time_str = "{:d}:{:02d} min".format(minutes, seconds) - elif build_time > 1000: - build_time_str = "{:.3f} s".format(build_time / 1000) + build_time_str = format_build_time(build_time) # assign color and accumulate legend values color = white @@ -248,7 +272,7 @@ def output_html(entries, sorted_list, args): # done with this entry print("") # update the entry with just the computed output info - entries[name] = (build_time_str, color, entry[2]) + entries[name] = (build_time, color, entry[2]) # add a filler column at the end of each row print("
") @@ -259,30 +283,53 @@ def output_html(entries, sorted_list, args): # output detail table in build-time descending order print("") print( - "", - "", - "", - sep="", + "", "", "", sep="" ) + if cmp_entries: + print("", sep="") + print("") + for name in sorted_list: entry = entries[name] - build_time_str = entry[0] + build_time = entry[0] color = entry[1] file_size = entry[2] - # format file size - file_size_str = "" - if file_size > 1000000: - file_size_str = "{:.3f} MB".format(file_size / 1000000) - elif file_size > 1000: - file_size_str = "{:.3f} KB".format(file_size / 1000) - elif file_size > 0: - file_size_str = str(file_size) + " bytes" + build_time_str = format_build_time(build_time) + file_size_str = format_file_size(file_size) # output entry row print("", sep="", end="") print("", sep="", end="") - print("", sep="") + print("", sep="", end="") + # output diff column + cmp_entry = ( + cmp_entries[name] if cmp_entries and name in cmp_entries else None + ) + if cmp_entry: + diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) + diff_time_str = format_build_time(diff_time) + diff_color = white + diff_percent = int((diff_time / build_time) * 100) + if build_time > 60000: + if diff_percent > 20: + diff_color = red + diff_time_str = "" + diff_time_str + "" + elif diff_percent < -20: + diff_color = green + diff_time_str = "" + diff_time_str + "" + elif diff_percent > 0: + diff_color = yellow + print( + "", + sep="", + end="", + ) + print("") print("
FileCompile timeSize
FileCompile timeSizet-cmp
", name, "", build_time_str, "", file_size_str, "
", file_size_str, "", + diff_time_str, + "

") @@ -296,22 +343,62 @@ def output_html(entries, sorted_list, args): print("", summary["green"], "") print("time < 1 second") print("", summary["white"], "") - print("") + print("") + + if cmp_entries: + print("") + print("time increase > 20%") + print("time increase > 0") + print("time decrease > 20%") + print( + "time change < 20%% or build time < 1 minute", + ) + print("
") + + print("") # output results in CSV format -def output_csv(entries, sorted_list, args): - print("time,size,file") +def output_csv(entries, sorted_list, cmp_entries, args): + print("time,size,file", end="") + if cmp_entries: + print(",diff", end="") + print() for name in sorted_list: entry = entries[name] build_time = entry[1] - entry[0] file_size = entry[2] - print(build_time, file_size, name, sep=",") + cmp_entry = ( + cmp_entries[name] if cmp_entries and name in cmp_entries else None + ) + print(build_time, file_size, name, sep=",", end="") + if cmp_entry: + diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) + print(",", diff_time, sep="", end="") + print() + + +# parse log file into map +entries = build_log_map(log_file) +if len(entries) == 0: + print("Could not parse", log_file) + exit() + +# sort the entries by build-time (descending order) +sorted_list = sorted( + list(entries.keys()), + key=lambda k: entries[k][1] - entries[k][0], + reverse=True, +) +# load the comparison build log if available +cmp_entries = build_log_map(cmp_file) if cmp_file else None if output_fmt == "xml": output_xml(entries, sorted_list, args) elif output_fmt == "html": - output_html(entries, sorted_list, args) + output_html(entries, sorted_list, cmp_entries, args) else: - output_csv(entries, sorted_list, args) + output_csv(entries, sorted_list, cmp_entries, args) From 5e1fae606a730316ca099236266ca28e1b0a1df3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 20 Mar 2023 16:11:24 -0500 Subject: [PATCH 28/63] Fix fetching of MultiIndex values when a label is passed (#12521) Fixes: #12259 This PR fixes an issue with `.loc` where a raw string can be passed like `'a'`(errors on `branch-23.02`), instead of `('a', )`(which works on `branch-23.02`). 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/12521 --- python/cudf/cudf/core/dataframe.py | 7 +++++-- python/cudf/cudf/core/series.py | 6 +++++- python/cudf/cudf/tests/test_indexing.py | 2 ++ python/cudf/cudf/tests/test_multiindex.py | 3 +++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e50c324a8f4..672e663d316 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -259,9 +259,12 @@ def _getitem_tuple_arg(self, arg): else: if isinstance(arg, tuple): - return columns_df.index._get_row_major(columns_df, arg[0]) + row_arg = arg[0] + elif is_scalar(arg): + row_arg = (arg,) else: - return columns_df.index._get_row_major(columns_df, arg) + row_arg = arg + return columns_df.index._get_row_major(columns_df, row_arg) else: if isinstance(arg[0], slice): out = _get_label_range_or_mask( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 79927c60a85..8ec08b7c92a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -250,7 +250,11 @@ def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: if isinstance(self._frame.index, cudf.MultiIndex) and not isinstance( arg, cudf.MultiIndex ): - result = self._frame.index._get_row_major(self._frame, arg) + if is_scalar(arg): + row_arg = (arg,) + else: + row_arg = arg + result = self._frame.index._get_row_major(self._frame, row_arg) if ( isinstance(arg, tuple) and len(arg) == self._frame._index.nlevels diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 5012ae0979f..95936c48b7c 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1446,6 +1446,8 @@ def test_loc_zero_dim_array(): 
reason="https://github.com/pandas-dev/pandas/issues/46704" ), ), + 1, + 2, ], ) def test_loc_series_multiindex(arg): diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 0f04e8c0f2d..a0e027d4c86 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -319,6 +319,9 @@ def test_multiindex_getitem(pdf, gdf, pdfIndex): (("a", "store"), slice(None)), # return 2 rows, n-1 remaining keys = dataframe with n-k index columns ("a",), + "a", + "b", + "c", (("a",), slice(None)), # return 1 row, 0 remaining keys = dataframe with entire index ("a", "store", "storm", "smoke"), From 3b8064de10e1891c7d104a6e63bb68393a15fc68 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Mon, 20 Mar 2023 17:18:18 -0400 Subject: [PATCH 29/63] Remove manual artifact upload step in CI (#12869) With the changes in https://github.com/rapidsai/shared-action-workflows/pull/51, it's now easier to upload arbitrary CI artifacts to [downloads.rapids.ai](https://downloads.rapids.ai). Any files placed in the `RAPIDS_ARTIFACTS_DIR` directory will automatically be uploaded to [downloads.rapids.ai](https://downloads.rapids.ai). This PR removes the manual upload step that was necessary prior to these changes. 
cc: @davidwendt Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Bradley Dice (https://github.com/bdice) - Sevag H (https://github.com/sevagh) URL: https://github.com/rapidsai/cudf/pull/12869 --- ci/build_cpp.sh | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index b68c2bdbef6..bc27e7d76b0 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -14,29 +14,3 @@ rapids-logger "Begin cpp build" rapids-mamba-retry mambabuild conda/recipes/libcudf rapids-upload-conda-to-s3 cpp - -echo "++++++++++++++++++++++++++++++++++++++++++++" - -if [[ -d $RAPIDS_ARTIFACTS_DIR ]]; then - ls -l ${RAPIDS_ARTIFACTS_DIR} -fi - -echo "++++++++++++++++++++++++++++++++++++++++++++" - -FILE=${RAPIDS_ARTIFACTS_DIR}/ninja.log -if [[ -f $FILE ]]; then - echo -e "\x1B[33;1m\x1B[48;5;240m Ninja log for this build available at the following link \x1B[0m" - UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).ninja.log - rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}" -fi - -echo "++++++++++++++++++++++++++++++++++++++++++++" - -FILE=${RAPIDS_ARTIFACTS_DIR}/ninja_log.html -if [[ -f $FILE ]]; then - echo -e "\x1B[33;1m\x1B[48;5;240m Build Metrics Report for this build available at the following link \x1B[0m" - UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).BuildMetricsReport.html - rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}" -fi - -echo "++++++++++++++++++++++++++++++++++++++++++++" From dc585deb743d853ba6dff61b8b29416c42436910 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 Mar 2023 01:00:06 +0000 Subject: [PATCH 30/63] Add skeleton API and prose documentation for dask-cudf (#12725) Start building out documentation for dask-cudf, so that we have something other than the "10 minutes to ..." notebook. In particular, this provides API documentation, and cross-linking to related projects such as Dask-CUDA. 
Authors: - Lawrence Mitchell (https://github.com/wence-) - Benjamin Zaitlen (https://github.com/quasiben) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/12725 --- .../source/api_docs/general_functions.rst | 1 + docs/dask_cudf/Makefile | 20 ++++ docs/dask_cudf/make.bat | 35 ++++++ .../source/_static/RAPIDS-logo-purple.png | Bin 0 -> 22593 bytes docs/dask_cudf/source/api.rst | 79 ++++++++++++ docs/dask_cudf/source/conf.py | 82 +++++++++++++ docs/dask_cudf/source/index.rst | 112 ++++++++++++++++++ python/dask_cudf/dask_cudf/core.py | 40 ++++++- python/dask_cudf/dask_cudf/groupby.py | 67 ++++++++--- python/dask_cudf/dask_cudf/io/csv.py | 27 +++-- python/dask_cudf/dask_cudf/io/json.py | 28 +++-- python/dask_cudf/dask_cudf/io/orc.py | 61 ++++++---- python/dask_cudf/dask_cudf/io/parquet.py | 12 +- 13 files changed, 492 insertions(+), 72 deletions(-) create mode 100644 docs/dask_cudf/Makefile create mode 100644 docs/dask_cudf/make.bat create mode 100644 docs/dask_cudf/source/_static/RAPIDS-logo-purple.png create mode 100644 docs/dask_cudf/source/api.rst create mode 100644 docs/dask_cudf/source/conf.py create mode 100644 docs/dask_cudf/source/index.rst diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst index 112df2fdf9f..5c28b4e7e85 100644 --- a/docs/cudf/source/api_docs/general_functions.rst +++ b/docs/cudf/source/api_docs/general_functions.rst @@ -27,6 +27,7 @@ Top-level conversions cudf.to_numeric cudf.from_dlpack + cudf.from_pandas Top-level dealing with datetimelike ----------------------------------- diff --git a/docs/dask_cudf/Makefile b/docs/dask_cudf/Makefile new file mode 100644 index 00000000000..d0c3cbf1020 --- /dev/null +++ b/docs/dask_cudf/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first 
two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/dask_cudf/make.bat b/docs/dask_cudf/make.bat new file mode 100644 index 00000000000..747ffb7b303 --- /dev/null +++ b/docs/dask_cudf/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/dask_cudf/source/_static/RAPIDS-logo-purple.png b/docs/dask_cudf/source/_static/RAPIDS-logo-purple.png new file mode 100644 index 0000000000000000000000000000000000000000..d884e01374dcd5e62db937b24990074d2f584ff3 GIT binary patch literal 22593 zcmeEu_dk{IAOC%hnH?Ers|Xn-E3&01B&%an$d0V6Lq?HQcCsoXyX<)mB3TF7>yW*V zJuPJzQeB{eAc*$v9W^}&qCi3ryor(= z{Es44H}Rh{Zg(DeKoB(}@eizTBJc!)c%Zv#%KAQ^RwpK$IWz{zrdz%|vQ=FmS4oJjo1#VU@g{`ZJvBSruB1SwM@VgEgbFhC^#J(2Q4L^8n-r3ysz z-xEV9#P;744<$_ezb7RIxaEIONS^;~NaV!-c=%5r{--wo*zi9~`NxL;Ifs92_@8t5 zj}8BG4*#E=L&lYWMD2;Q5QJPf`!BR-9Tsl%IeL>pkY&KX$ep2vq6!A#e^B6Gz}LJ? z3gW(T?7v8VC7A&Pd`sH@UVN@%3HA|S{I`!J0~{a)`QU%=<%SX8n-V4*&WRE6!?`Dv z!GE06CTm0>wE^#?fpF|-&WL<~u`e@{|M~Xecq$QEMD}_w38pY(se@s;>e#GA!7Nw3 zqm}fiAPbqv=1TX{osFTxbZ*ul+wIMCh?M6ab&*>?A(=D5uq$`5S*(K5LFRKo4Q;#| z%Iea~q7+Ts34}u{DV=mff)*5c`M(7}ZbP3!(a=3!qaD%H5e>49xOERU|N3)O`E2A@ z;vlvpH7tH$-u90#|0)LOcG~k)Cr2rhK`W<=RiwqC^x)HGbrY+|Esg%8(#aZ~Lp(v& znc}eHBg~BvzJawfaEV(udXIX)aMdSUKGu6Pry_6CtlESsv{=3a7N-i)tQrIwE9eNO zR&DW4h&eBc+O~adnQwL?Eb^1$?j3sUdTK7+7$*PetT}U=V3HDdbyg^f8gsU*%Iv0p zg-^Jf?$@&zN@Og9kK$J8FLjc5mqKGgf!U+x!@8^|ha=kt=8lJ*ejn9&{8LG+(zzif zAY#8i-b>fBc3PPQvYwaf18(hkkb{-Po#kuTG_&8(uf@0*Cy`IwU3r5J@lxEPzMq6z>r3<8=n2r{q+smUP zAC`cLm#|7v`Fu{vx5}IbsHW8#rD%w}s6BGwnA!IDy74;IB#nC6O|0cNXMA^9Lu~aUs+YH7VXH`p zZMeFDU1#@cN^`~d)vgcUUn@P-SAMGxm(Jd3#rR7M4U=Kck&USkHDe5DVT$R5AdeN( zzLtkFS<^kfptSn#pts{;d^G8_M!u^t7&4>?P*3=tzeJHP-}#fLL2F2uF+3@^A|+`8~|aefy_FXWld zuWeTJMUV`=>-(*@`&OOg^7m_a&j_Kz;E%4#n6myw!>p=gF*?hLB| zoOkH<6cF5-WX+jKtb2rXgc5T8GDyq{*raF!*K0R-a_^gDKmN14q%E=)4CfS${)jto zAjK>G)BPQGXz+Rbal!JCJxAK#OWg1Wv@|z#K%Pf0*Pc?5DCL~|Dta-8UFzjkT(I8A ztAdvdY=31DH(zZx;S05Zul5}*&a)Ww%(%d&1QuCr^J6|dbN^KTJB3rD=m0?~tn}MS 
zZBUt#=345kWag1}_7V=}= z?|6GDzK%k_i@BO4yA`|4_FbcIlQRb$HHWfAymu$tHf;Z6k0F zGhktIlqT4XYe2b~Aaj92l~m7|R$nBweVAt??QI#$l{;K=)^~}V!I{Uty<@Z3_0X5SEcLV49~Wm+*TSO%7i;n`|VbBo#1EAQJ0Z7 z6nwHjlFe@rH;!8v+43}!ReJf6b3XkEa+%0CMLPFm9s!LP+B< z8UcxuHJF%{+UL_5!oeI1=}(b)#05e6YdBfysl@^r)70<@|qLmDY zWCtOXV?#gqc|P`_qg*uJeY6QUhb=FkIGx36ArzSg+>j>1_8Aud^czNz6lRPiNWaz( z*+``I;rUG6s7ic0IiKy`G)cG#HJ&h?kwOQRqoZ!W= z*KcXQP&;y5$OMFv0YdNn3U{t`0thPea7;;;E`9*Gitn_X=grGr#BA>qZ0D#M{`63d zX3}T)oIa0&EU!14_HB=tGK_zylXV|I@=R)DfGi{rQ%QPh4UAN#wrF|XK6aHyg&|oA zGfePT2akLu_64)(1+!ufF*zqF3U0gY&2oW`fp22lL8pAD=L!4LgP%>dZLnFDE0gTV zfaUq^YHGUGd)Uj|*KtppRW@3j6hv|MO4#U>i;6Wvv4@HKCbvM$9{U<1aIBzL!sgHU zV4(g-_RI23UqAH9nbwwSus%!T%(^Z6CD?ibMKZNpY~Ai;r{iN9YLw@|1yXV>9QWf#ubpU>sWf<6sCE$T)xmhD{qj#~~6V%>866^5yF zI9WDVHfF6BD#;>=82n-39V>U%3^N~Ou&2#`%1Jo#n~j_{@-2Mskgy%gNpB-#yM3*V z;>uep05bT}S<<^6R&H6p-1^X*y1_k(bL&q}v6Cph{4W6S>G?56`J^E0+e6t>N+gWfava#_9bKZ9aWzr7T<$(4p~ zgPx=jdFP%o!x}P0@$Jr=`-{eJn@#Uh>R;g&t?i6w!BU49 zaV$LJfG{6krbT5b9Vj-Nnygfot8;JuIXc?qk4ZSgc@pR%NJ*HqYr>Wh0wF38Vl6ke zTG!keI@3e-l;>m<5E+Rrezfp51IMUH%anJgHcX7eAJr+sNBo@*YbjqI)|40o`v!QF zZG}nEVAvIBNf-w(hXFM@)YvZw_0N?qZ%vhmg4 z9>30m%;b}CRcY{6)KHi~rW!l^Iu%an9uYS(;9A z?l!phth?i)kuOQ6r@-M2$?v&8g)qUET-Lu+-sK<|-}<-0-R-`=*g~mo0b)=HOo`}zRVsY;o_f{ zPXfZnqq7DWh6CwzX@GN7|MAqZSt_J~ePc#B!*$6aX@t{B0${UX-!z;&?4FH}ZReV0 zkQ?=F)hp*N%iMb8dpSEZ5?_wn_q{+Ld@Bg14v?#EL94WLnBFzDnwb4k-buZsGm!X~$~jj#-zUpSUmjm2VByJjzgYJM{fq)|(ucr;EHR()Gl({_@Tv zq@=kp40m6l4jt}Jd7@=A)C(>~3NF^t!A>qhej|@@g=NiduNab{5Zo{%M7>BwWT^Td z`ttSBwB~Gq8@l)ec8X+Pyrrt3vE~?>2zb>0ce|;(bAD#msjxI8YEP` zjLXifwQa7@ejgIHe^|Ds`Z@Kj5gAQ15Wj9H%R)$YWV)=0%++s4-ut4TgI-dX8y=H9 z0}7n`K!Tdw`Q`E-@0sv!qVDtxVddF{`?AfnILRRl6BOGHVyxs~f}+7Vc}*B}&|6MF zb}c1_*H730ee)hH7G$})spB8M#piJ{%k93_$NQaz-hX!|XEs0eH+ezv_l4JYJeGo$ z+$qPcSraY{(e$Rrx}tgxU#&SjVJRuUEG`J7T>irn7=i8f-3&R8cuYdgG_@P*rf+Ab{YF`h4@QxW25$kEpiiuEbX;~+20|QNYqohenIPr z;p-plx_^N6D>YfJES6M6`s1Z35%y!bWY0N2cfB>Md^A4v{N1k0h{67|uH+CWaoYnfq*^wgP7ZSZU%pM^SkSyosPU$2Qb z95%~P6hCk 
z0-k9xVVIV5M9=>p&k;UImp3{XArV_2+lzE#jwII>~bpj&(= zSP@c^bZ1{oner|OhpNJQwHAbnA*C!L4J%QGv^Dv8k#`Kx5di|A^qh2OhmQ7_aItvU zpYgRb5P*k(232fy&h|y;p1KYL=30hAvvlG~7p4~=BL?{jU6_9c6&v!(zwI92kJWsO z+`f7R*a^WQ`yM&z)Nft$nXvGE9Oew5+*g3Wsin_hL2~rSZU5;z?5!HUbT^Xi3jpDTk! zkIt9Amm2SwbReTu>H>u8>##L*sXLtd$)#-E)p}U=|7HOSBUPBiLGhc^)|&ez7I|T? z+#7^LPc}DB5+zlzPK^h$T{vl8lIRU3PiE^wqX@be7H)XvEW6Z^y~h{ei{KdbMxG2DBp3~V8hToAUn4vqn5 zs#FGx5qjH&=bk<@d%;gio^cznfzM|IZFW+&MdcSe_9mUQ<}YAjePvHn#Z#az4zo(< zO+bNyuXju-v8Y@YO|44l|Ab#0k^~ryA=oQ=_2V*@b}k{{y^xf)Go*aymy57`Dnrtl zP=&TW?z%MwH_6+lGyb>H|clk$90C^%IY!Wf`u$!wPe!y zaPf-!vD8cG(W0VU49%bZ>d8KS`Ai(@2NeP(Qprxu(Kgbx7r(azPnwm6W*nr_oo22g z*cXR@_DM?U!PD+ic5*#^qFA4h500cfGQnnPMf`%UB!dW&w`=xdj_!xe`@En(alB4+ zY1m@_O!Hp|wDJ`ZQgjHD*KNwqG&RLGz2?$6{fSV`?c0*#&}Cu-sbzsrC73wH^Yn!f zijdW2y;5)VVX7QpRmnpYOD|_B`k;4N2apW*B?xL|OX5Z(DYP_7&ww!dyPukP@u$x# zP(BY6ptrEw{tlPz3x5x>{k7+Xr1U~kL7jj}A9gzNOmhhZ$^}#*uq$DJR)@JTgM&kq z?bZ82h$`>ut@ayy&TO0x>h!xE5+?|sYzC;56`Z`?xq(FF&&@A7`qYwhT_Ogei4 zBoh;mC~81PQbJFRRs)*5#a*LkcFbO!Uqv<^kJMFFr1uJulcIIMHL#}((p{(ux)~{+ zJR)!QHLk&%Vwf@Aj;w9wi3lp_^x5CBk68yjY!%G)eb`GKWv$ z%UI41_S=v_YL?cE7C1AVuzc z(^P|2I;hP_im@S`iI_cJ{&@duW^FdY4zpz~rzhN_*NoLaz^=)5uN(`jZi`;1f#8;r zJfz`nh$FKaM~dpP5m#B7hCcoa$OR&DOG(HJilDs?!x7XS>kU@@_%W9QQCndzIOh+0 zmB=oz3r;78e5MTMr87Xwd6eYRn7c4JuhKv|pi`D{AwW9qs)EDf+lhj@ChfV{f%ih4 z8@Ahj$6jfquxC@TE%)lR=zUFIA>}0vrx>`f{tZ4^ug-RYZP4Nl$DfVRfgLb{IBw8Z zm>%S`>hi6B1Z7{E5J_*e?f>^5cB>Q3RSGP!&psNGAj@KeH%B(d8wtI`cF?h}VIE9! zAAZ^B`_(dTS@;=dOvJ!G)h!Ro9%btE{@s@^w;xC;*B`o_2UlQ2lor-e0g$pETr616 zPD;xJ-=;}7E#q4Xm56R`8J`*#I;z>|t6sK*fR3FLNw=(T-*!c~TPUwg>TW5&yd%=; zYP-GjY;NGXI<+-0%f=&@PtoR2!Uo9op@*PM=J-QfNHBvc^0lZVY~hPR^_uRb2r;5rUN%>Zw!fE;E^Jd@jEIep&6TrFYOw=)y{17el)nD zBSlQ|^`3<53^i3B?#}D4c!_f27F!5I;l-O*E2GRwn^;xDN)mdZU80YQ(5LEw+S z8LNwz`#zqwlU;$@XXTK+OaH`pS*UMx9Q$7NN&g*6ngMo%GPWjq;5>YqJA!H&gSeee z6$)Fv2<|}HY;A39{53AnvFxVZqo_4OfmqfbcEJrNZ(fii-Gkx~XT~!~ zE0t=h(=SDQx?;TI(Rm7xc2Gl*;;g3SUj|ANeKk-$c)Vqryi(_5|3hSkMAXQ_`qyV? 
zO?`G~K{NP{Rg1F16nym&XmpAfc`~1c9xC#R(;rkFFK8&6+O?LXLX>R&f2 zb1OO`A@vR1oM>lV_WaCwVO683id!X^Dbasc_k89o!-1THdZt3@VbwY#ymF35F_t!B zi;yy=o)4O?7lHKUBtC6*W^OPno>&CpTz%=VL#F<7D^3<=kc{STfsp0YZv}S2-WLNK zO=b$ZB$*%s_QS_Buq|ICb(!qd^ThpX&~lCLfDIQ0)igj(S}rTw*InD|825_iQVng7 zB}vH}ZFNxeI}$rmm=BSE%MT|@0uDV+RE*wJd?Q>guo_nnJWbiAjX$q4)xdt*ZMSar zhi)5#CbIcOxT>QOT`|ll5wNp-jy5q)b<)*+@wkt?^S1u!dipX}WiEPROex~d+i*iW z*t|sZth?6kpOf>V2LYg5*At2+$~j%unCd0PM#KV}l{k7HSTQEfgtp=*|H?YBy3#C& zH}Y7XRqVs`@u6(bzC@@2fRb>X4<@9d1WLb#P7kvCpf6YNPU)ZeORe#Tl?(25yCt#< zBv)xVJna==3zR)fHrL&6MaOg6RH3!z->_Z$I5iM`3&{8qkix zPon(Sr%I)IT|J(Z6lp>_#J1H_{AW!&#@59`x#uE-Y<~Q2K%~;yC z-ffxJM!wC*VX9v2)!(maI)x&4hXr5TIihUrp#7fv9C3K80vZt95q zMeGFK+|LWzFWzVhqxLTK9ZuLWY7{!verf2qeJ^0Ba|7w>F1fLFTJgJehoW>|2rS{GxdlSr zrAaBX-MN1e0de3Lt{~6>HCt3@FSTPlxnqY&kHws}oYIqUl8kD{wrIa!7w9|;fReG( zpOyY|n{!GFNU$VmU+QVjd%OF$^UhT{TZTU`J8UIp08Z;>Bq;i+yFl=j)B7V%3JnoK zI0f7L6xBipC|SzDIXGMdcq=r)Z1`*Hy)vALKeMlk-HS?MNRxd}=QL8^d>QAhVwr2P zo|JVVY2iZ16gHybZ-`}v0SM~DVuk5B%!PQD;X_pV12P(174`PICD9Zr^=VJ)_pf7~ zU!^2RfZ&J3S4*FI+ys)GP@voqsdu$+RzyIn9?&ogMaL91Xke(O(aJyXJ!}z~OD1hp z@*v(xF-Pb*KeNd5D3(YCp~`!xfYlY|o2zoX;w9gCdtR(vZ%RA^n!Hvbw^CwKpEnG! ze5Ab^G9~=Q7r97W&8;~5DEl6d*s3<%>=@vegn+a};G|FSwSJIWx7 z1fC|Qs8QdESyp05Cu>BgZgYV8Y)oEoFfk`9VORC+c4RM8*Tu7|rTKAubDnvYrbCO< z$I^?4nTbF?#&jWFl=Ibe?vgl@0kqN@aZSK}^;YG8P*-2lDSOA5B{l{)wr`WPDWgR! zPW;{xhe>)j-357zvd%YR$=NSA2Y#9xL5rEWW*8E=BRcJ>?7ouy5+K?xCj}Sd3Q(UY zTf{r9Z^NbHrMHS--r|N^h!>=9ZS(|X#RZBBBuUbI`AS%n!&oY%5>LO?gxyjz$(6J? 
zjK^BMmvW{%b}6_A`E(XG^OeX>yUOHk8Fz@MJ(bc)^wq`%g*O`%@#yTV^e~RLz#Twn zM6Oja5fUzAR%FlYIhRioe0fJX9yikAlEzN_H<#WQlj!iA(EhHU9kJbhE}0TD@wvf( zxsxe%*Q`d0l`Z!*NFAl5ow^&&5m*EexqI3WF4123;9E556s*Z&7(OkAOB&~T@qqcn zEREU!v@MA4SMvswqjUxf@$Q{|DWxruEKok7Yl~y{j>uRz?b4rkqu_)yK~~hEDE;Np z=rI()#q^~icH7s+|L}eM#gP@KFpCv~)QPsXjlW(N{X^Cyt&UBOYDZ*(*16r#+}Wf` zXe!4tpEwGr(j z7pGekWoxN8Rw;oKvI?t%0Qr=h$|g%LHy%=QO#p_FyI1G&RI}|YB2ZxMmgbDR%x5aL zSF?%ZFZ2$m*%=7%+7b?O%>1_w=g(a|-`}ye#i1fznjWHMoX^5ZS^!!`4^3pnHKcU?33D_c z8$d<3#UFSFH;k!@{LGMTwi`x8MDkT~NnJc4%f8Mr%t_}?OVRpJ6DwLCX~;Pqe$gfc z3GfMQsw1#nB)u4QVDTW-e|hiH@I=YGlj~q4p>T4+j>e43Qnd+NAtSEwJN&JcR^eyX zfyU;Brf7E7(PvoM9*VfR^mPhI-ro3A{os1_&0gEwhJA?x1270D9d^n&WCgN#X?+qi z5W*K&OL)O_N*N;S!ePdcMMOPF8#I@S6@RTizG4@USmfyi(*$)Rh~Q%1a*#eEU(Z-) zAc)xgx{NJu+xT;*w1VzP)Rf(0DkvP2Zv>QxZdyz^z};r%LQNFyOx9MKs`Q%_JzkFX zEKxEgL4ClS^l*ny9s-gpV0ceGt)8i+KK+*fd(j_E{E1om-~LrQN;{wKDh?_C2?TMk z2j^*f669T$i{C%2Rynv~bVdV#t@z7RJQs>a&w}lGYL5L!_rgjFU*rXiF|XD8eW1Sq zT^pZ$gaWiMPKz-Z6ftno<@Wx&ir=kOi%__ zht1_sRLPS7X|A_NW(U-`9L%VC5CCr|`U=s!k!OSnGVwFaplXvnS_unCY`80!N_MPG zhQ@d>ui8y}qDKZ9dIO4ZRh@+wf<-#m7g04ukLi9l+JOsimU{I@El=4`{BZEF3ZTEI z4%qVEj5;WVq3~#5N^J6xMOngc?I_g|g5jp{N!SDfVGHSMXaP?U0SQdoK%?TU>elLz z6?Tb+Ql)S_U82)FbY_bFr!WTnqzSm z6QziIkk7W~++0zM#-XId6s%kMp`P0Bn0x zXGf%eo)h8lEB@eQ)2ouql+$9RS6Qt&Z{ho8XLJoY!9;+s{t}mR6r>?|Yc8?IFUn=o z-YD?~1Rye~WQy!zC6Pa9=MH~Y8U!)r>%*D@LCw+^E1DO;*&U2OOMu_AVf$8kL)w_F z&Mcz!7&;~{?F*Mr^o#zq-%+?2-9A$z?RxnIKp@ z?h#wT{T(mkA(-E;P9i(f9ge&p-%f*)u9<0u4u(4c#)nimQ`Pv5p6)|U z6T2%fafy;_F)+V4dZCzz_-(sg-QO?%auqmpY}i;(4YLA$D6`HzcgwrdrcL6(mvS9O z$n~zdE{b}y>m=sSU!1>cdHa3MrJT4vZ{M_y4$}d?lQ1}C(4#+EtTfs0HEd`cH(j?& za9dv4&42`@25S5k1SgxgPe4BC!`SgXfISLzI9-S-=oJ<;#gl`?_lU~Z(uY-^}4 z)jlh|0RZ+p)+^ekgJPq%%AoNdnfE~kpsXAB5*`8Ojk{il_wvo+<4$Frlmhj;sCUm2 zny4ZYa=Uez<)4==*FH;M{T%!^olpME{sp#_vxV&0aNvja@40FEOU7 z$~d>Y)>?9QlfP%*W7#e&W1V{#*AO^`RUEzgQYI7yeV+|TFbbI}e({9#+`qaI;ktam zzS)a(y_;tO4Mev{q&*`I@p7cx#m6plo>KewOB`i5EM+kgZ(mRITWg#}7M~^j;FUmT 
zT52idp00a&E(YSezJWmo3ZRbZa;`j(0+&>(dkWt&lL*9xIComLB_!Wz3xf=p2tUXk z-1b+p_x0}*|G2=nA?dq8_zlyw6eCPuCn>pFY3M(MR0RD`Gsrn7I}GVEPm7Jc zvx~ZI+fTtu+d}TU^P8S_Rr*$#kfr4USW?htfKC*)c(q{qPA`=>l^{b2Q*u6XF*TtK zf?beKi9yka5v2Mb5p{eG+J{V|pXmq=hAn z4(TB+O6fZ6(JqpSm3&bdV`vT=zD_)^z0en7-|>a2%B|X9|L)2VEP>c*|A}~? z40=*I#r&$4EtKdY5Kj+2f-`Iep0{`eO>Jm{B&>8x2s)>9&_SmjNT@dRCA-k$M#)j1 zbxZsj*xLdqWQw%NRW&gq^UO;N>*Ob0!7tB3ad!?n-f~hjpwkV_gFzeDhxoCLDbhHRxYnk3pOO^*-+G~@=l2wbng%7x83ULYxFo4}y(*#)<6dZg$RD?GExn2YXF#V3m3V_U1&e7_j5_9a zrEocwBBm6ai%2s7ib{&MF1v%eMhK8I{aEb6xZqI8eiLB7V)V}lmh=ya)37X?`s%}+V>JFYm zO4Bg3wmpm%T}o?!7L2AYSbV$vHxaLkr^a|BE!dHurOpH;-@|6PhCg6!+(56j3p1_M zKD)o?7uGnoiQ8YVeI6g=m7AXDg0v$F;#<`1BG3di8m_r53@`9}7wEeYVkYopQipX(cg=e~ieQZ#?O^1=waeu}fz?}@S6jdK3m#!Ep9J~sdyn6CUxWFymT%~jf4#`S z3G=*|!4W)5sbktU1qd(uN6Kz*e|vQ&M)FtgrMALg3qpgvrbsOnlS59y|7T8KV&w0^c2*Ob1v z9G|NGf%!X*Ys#0%ACogkS9Y2F2vt|isPij>Vb|p#0dZOWDj6+?h>W*bsvKLikR_DC!p&Zzw>_e=i%t8K3r}mv+<42r zNIt~{y&?LUE4EQ{4)cb4H)@7unpDr^SeoV_21uZX(mNewKLu#bas>@fy_lS}ZiOeX zu=nSMeGUQKger*FN0THaD}??^gLmg1OxVMlCBIno zDSdXS_=m2Zbn(-tpN}dt@fE?hh*nJf#5L8I#O^5LeSF>+uI9Zm=YNYIs~sF?Hws@* z7fIL%Xh%4ZnAQF1a=h@_vG@4)q>H1X9hwV7Nj5WmSmlqrz50MquAdOQ*NWR>7#czo zBsmCP@nsw?->+|0W^R}}b37T!ZjiA~0h!hpqJ;~yM(e}8X~QOT7-7Cqg_D+8pm*rW zI8b^EJpZEX0$5FIT1fsd@j-AqmzwZ;V)UouzB5W>;^H>}E9I`4JjY(3x~)^l)sVPs zF&rbkCrAP9@rIORqQyw-Q$@15i*8dCk43X?37eb?V1aL#mrVr@^_UpqsaL@V;>grw zkpw?j<@I;^WG6OoP`v`-53Q|UwOyUL+`#?q=^JC!{N-zu!QNNb=~0rq*<<)axYsJH z00)v&&UJR0@Z@}3C(HdctL=x^-YI-Ks66Bt+6`POkkveqz2)fSDgs&RSg#~9zDTPf z*OI*TGQg#P&%^9Y@TvD=E$TcbCchU~`4C343Wm>-sX`>VnHJ&bWL0$hcIJlY7rf-@ zvxEcr@lF;*kR5}__-XvkT6~M3M(-#+WO+^%)O!_U#G{Lj_5W14RT!^F$hn>z;aMC)x;6JnqwE&w3d?xq^92mzw9W)cW!yA2jGGCv

1np#V%;7Q6~;Fg zxT)hjZEy;&1;tj&bi`=Hk6oduPLkPC0ndZ)og_O$d8`mJ;uU1ad_=I&Keja=ufAU% zLn88gLZ3zcL}HJ8kmZQN`g@!2)pO5Ki#3N%(+g*YMZhinn3%oNFI2+?zEim5Uys0( zKfBuk1*Gk)apuCwq4n!K6VFH+xWCkIqrw_=+)ezZn8wD-uGxOyxR@l6p)Ys(l|GA$H(>BLODi-f+y0l z9xlDQtH1z!;Bq%5bHk;k$S-*(wV_-?7NR{)jKbXcam(TxJh1AevC@&tV(nOw$$Dx) z@)KorkJzvM;?RZ|>pHSk;#@&}l+e_c(n$E6$%g>}EV@)8E@O1#tuyvux@B@HR~RHX&Brm-|k3(Zxk^A^Zs)l3we$!znfng8Z)Q)RDavnp%*1UA%vmGErOgGRFy{ z1$J-Kl&kQQg<8)sU(@8w?V7UaE^Cv2qXhTv`ic_@-wV5MP6V1R0k=Hm^YK8$+=+uX zYf;ZF{$Tq=N(g)a>yx%!5T*8o>R^b`+@-GVVf~xp)3*|QZ0ny9RZ&Qv`jDODAQa|- zf1yd2YTcc>t}cCl0Nz%?hp@=sNg_2d56H%wcxJzrX{3qhRWfv&;{C?%kV&JClVqSk zGZv>@dkV-WA*F9b=uoik)w8sEi(i!vl?7|=sq=Y3!2!py!VvoBW9LNA#MM}mWD-kI za(vx#q4F-)^Xb~&iA~~cFhF{R<%KP~juI{yW@oz8h z3m+fc5Z>k?JYSUFqacB%7K-em<`MO2>QN?^506|RB~D_hp{)-Ka+q!BxI`O&o!&>m zg(LIK*%R!gz+x~L1##I;S`RZ}S`f<3*|s$>=nqk`^AHGSJU@azM|`J)VTq^X_^?|Z zk=9$(X@KV6guG^4yCuGERt&5wmA1HB6&Uv5YE#H-VGt9jyI6j(X254==En-^>*2L* z6JL)_aM~i?z{MW~MfH&8Vkx3j6Kzmq(O-Jfs^4m&`$05J`mr6p$9(g3`$%3(U~5h} zD#kCV#ChDQHz0~Z2F$NQGiM1ZTu-pr`Sg^(BgTtlX5NDi(vW2h@p6VV{1TNWc=Pc6 zkG{Ad8!>K``QLs_KF|yUSROsc`M-2nA{;Sda)ZJXvWCV2OMAr zi)ku=O27ag&FmAD_+1|5fI+ng;+bITJ4?7{0w5>8rWHAh&`<53r;Lh3#EG3Nelh;8 zABJNA8$5doMGxjP2(bnPe}V1Cgr`>*p($T*az`?5KgV1n%BijBDy6S;8k9uLwaats zCAVzq@^AS6HkI_LeioYD#WFE8%S;C(n5J<+U44)oh@@;bKNEacTN=vvH?7L>axG!> zu7PAVkYUN9+e{dTiybvg+`C==j6Js{GKcA$vS?s7H?PW6<=GF)w}I#lh@&d833*7J z@P@ql$NHyzpOU~QTi||5(S85$DI1`xY2$-cM-jp{miEypU7Q zRgWvK$Y1q;)1LcbvEPQ3S2t+U_@06-yxlme7-d4c~v`Z4Ui? 
z=4gtCfhbj!X&2rdFl2u3bS_vQMpCAAK<#Tv9rWuv3%s%{DdniJ2A0GM<$*Y!akKZs z{!n<)cr7I&=Zc5S^Ycv1IEiEdf{yT_3REK2WYIR$?DU=Hh5TvE*H2?3*0O<4Pp|6K z1BD?DIA$2Ke!o!4{4mwDlLJcWnNzm@j1$@%(IcZNxqCn@TQFK{J2_~p2Oxb!K1xa| zEk_i{K@hMZSC3RQd8{aKtZUDFPWCTdi@v55af zPrSe4PRBs6-`UfRSbSPYZ|8m7*1co_7BaL_R47Wf`1HmpfSckw}F5 z>f^b7!Of9nfNPI?NAtP@!tdn;P5-(c6h3+HE7>h#VIoue@&TUP*=(h}LK=5^qhdJa znOTcDDlagYAIR#CRmGx$w3N6!=h6B=YwtuG2jocot5nVNcg|0{2h8Fude?`{{&cAEO`*Z;XyW;0K91L+<_0}oWHhr?4iGV#`{B5MbE-_#S)&R<4*7)#jN+u9-^Na z`6MaDFz(3~D|>QH{Bg>Uz%;tgOpwy;CX1wo6nhHOkkrc2>Iw~p&Rc^ovD?FG6d8}e zH}6i;-Qu<9=noKaZv`C5aPf`2CJC}i43KNgG2Oig9BX#|bHl?P&j}!{^{SJC7hYh4 zkM1!CwNtE9F@Yl>JvDp8UGt-2?_Oq^1?SP6f^JS>IyF~ksF;aC-h=Rl+Vz_COVi1> zMrE$272m~7M+W?|x)6%m7mj*iF?lR)d90aaR54KXW=(QR`|Msk&w>A9VpL;?BKPF3 z_uZT~l?q3kbZ*ixaq$YwT>+Ohu^R!dGdGh9MvIEiQ2eRnk7xU>fg-8gI(;^qd zd^H`nztG^5ZL8rjq+(&?!X>i4zztx2^>XKDT%2K)s;DhBrVrZhgJy7-L;ATBf=Lffc=)9|${}$No0#<Nlm|x(8>^JKpuX_D=bxt@5S+8|O<{GKl@N3({fwqh#`M;=0Gh zpH-RSA7`i~zL>7~Av^Jd%G#)d^;Wydk~2wc~g;_etX%|4q5uUsu;px7R(y zb_1B^fr<VYUc^PI1z|?So)Vy(JIjpZF6Zik~ zib=UXyKV?A>-M_$|DWay))mdbMGg!MGo}DnMIVZ1P}BV;5f}OYd1yk-^vCXR_SS8_ zb@{h-{NwDWr-4=cCP@WOsJZ9Cb~89!{&(dUC(r~ zWitsP1)Q7)nlAk3(GsA!48otU-(0l2BoN4yTOY3@<|hqg0f7S(aDNemsRFSZ$nk)m z!8ZYM1|P&RAT2^*F9TT(D!}>)0zAP^0I?@Q0s+VxH4q#&qhSaRhS6jL4hDwNoB|Gq u(ZXT0aDdjdz`, Dask-cuDF supports creation +of DataFrames from a variety of storage formats. For on-disk data that +are not supported directly in Dask-cuDF, we recommend using Dask's +data reading facilities, followed by calling +:func:`.from_dask_dataframe` to obtain a Dask-cuDF object. + +.. automodule:: dask_cudf + :members: + from_cudf, + from_dask_dataframe, + read_csv, + read_json, + read_orc, + to_orc, + read_text, + read_parquet + +.. warning:: + + FIXME: where should the following live? + + .. autofunction:: dask_cudf.concat + + .. 
autofunction:: dask_cudf.from_delayed + +Grouping +======== + +As discussed in the :doc:`Dask documentation for groupby +`, ``groupby``, ``join``, and ``merge``, and +similar operations that require matching up rows of a DataFrame become +significantly more challenging in a parallel setting than they are in +serial. Dask-cuDF has the same challenges, however for certain groupby +operations, we can take advantage of functionality in cuDF that allows +us to compute multiple aggregations at once. There are therefore two +interfaces to grouping in Dask-cuDF, the general +:meth:`DataFrame.groupby` which returns a +:class:`.CudfDataFrameGroupBy` object, and a specialized +:func:`.groupby_agg`. Generally speaking, you should not need to call +:func:`.groupby_agg` directly, since Dask-cuDF will arrange to call it +if possible. + +.. autoclass:: dask_cudf.groupby.CudfDataFrameGroupBy + :members: + :inherited-members: + :show-inheritance: + +.. autofunction:: dask_cudf.groupby_agg + + +DataFrames and Series +===================== + +The core distributed objects provided by Dask-cuDF are the +:class:`.DataFrame` and :class:`.Series`. These inherit respectively +from :class:`dask.dataframe.DataFrame` and +:class:`dask.dataframe.Series`, and so the API is essentially +identical. The full API is provided below. + +.. autoclass:: dask_cudf.DataFrame + :members: + :inherited-members: + :show-inheritance: + +.. autoclass:: dask_cudf.Series + :members: + :inherited-members: + :show-inheritance: diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py new file mode 100644 index 00000000000..1341e7fd9e7 --- /dev/null +++ b/docs/dask_cudf/source/conf.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018-2023, NVIDIA CORPORATION. + +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "dask-cudf" +copyright = "2018-2023, NVIDIA Corporation" +author = "NVIDIA Corporation" +version = "23.04" +release = "23.04.00" + +language = "en" + + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx.ext.intersphinx", + "sphinx.ext.autodoc", + "sphinx_copybutton", + "numpydoc", + "IPython.sphinxext.ipython_console_highlighting", + "IPython.sphinxext.ipython_directive", + "myst_nb", +] + +templates_path = ["_templates"] +exclude_patterns = [] + +copybutton_prompt_text = ">>> " + +# Enable automatic generation of systematic, namespaced labels for sections +myst_heading_anchors = 2 + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "pydata_sphinx_theme" +html_logo = "_static/RAPIDS-logo-purple.png" +htmlhelp_basename = "dask-cudfdoc" +html_use_modindex = True + +html_static_path = ["_static"] + +pygments_style = "sphinx" + +html_theme_options = { + "external_links": [], + "github_url": "https://github.com/rapidsai/cudf", + "twitter_url": "https://twitter.com/rapidsai", + "show_toc_level": 1, + "navbar_align": "right", +} +include_pandas_compat = True + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "cupy": ("https://docs.cupy.dev/en/stable/", None), + "numpy": ("https://numpy.org/doc/stable", None), + "pyarrow": ("https://arrow.apache.org/docs/", None), + "cudf": ("https://docs.rapids.ai/api/cudf/stable/", None), + "dask": 
("https://docs.dask.org/en/stable/", None), + "pandas": ("https://pandas.pydata.org/docs/", None), +} + +numpydoc_show_inherited_class_members = True +numpydoc_class_members_toctree = False +numpydoc_attributes_as_param_list = False + + +def setup(app): + app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") + app.add_js_file( + "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer" + ) diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst new file mode 100644 index 00000000000..0442ab0929a --- /dev/null +++ b/docs/dask_cudf/source/index.rst @@ -0,0 +1,112 @@ +.. dask-cudf documentation coordinating file, created by + sphinx-quickstart on Mon Feb 6 18:48:11 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to dask-cudf's documentation! +===================================== + +Dask-cuDF is an extension library for the `Dask `__ +parallel computing framework that provides a `cuDF +`__-backed distributed +dataframe with the same API as `Dask dataframes +`__. + +If you are familiar with Dask and `pandas `__ or +`cuDF `__, then Dask-cuDF +should feel familiar to you. If not, we recommend starting with `10 +minutes to Dask +`__ followed +by `10 minutes to cuDF and Dask-cuDF +`__. + +When running on multi-GPU systems, `Dask-CUDA +`__ is recommended to +simplify the setup of the cluster, taking advantage of all features of +the GPU and networking hardware. + +Using Dask-cuDF +--------------- + +When installed, Dask-cuDF registers itself as a dataframe backend for +Dask. This means that in many cases, using cuDF-backed dataframes requires +only small changes to an existing workflow. The minimal change is to +select cuDF as the dataframe backend in :doc:`Dask's +configuration `. To do so, we must set the option +``dataframe.backend`` to ``cudf``. 
From Python, this can be achieved +like so:: + + import dask + + dask.config.set({"dataframe.backend": "cudf"}) + +Alternatively, you can set ``DASK_DATAFRAME__BACKEND=cudf`` in the +environment before running your code. + +Dataframe creation from on-disk formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If your workflow creates Dask dataframes from on-disk formats +(for example using :func:`dask.dataframe.read_parquet`), then setting +the backend may well be enough to migrate your workflow. + +For example, consider reading a dataframe from parquet:: + + import dask.dataframe as dd + + # By default, we obtain a pandas-backed dataframe + df = dd.read_parquet("data.parquet", ...) + + +To obtain a cuDF-backed dataframe, we must set the +``dataframe.backend`` configuration option:: + + import dask + import dask.dataframe as dd + + dask.config.set({"dataframe.backend": "cudf"}) + # This gives us a cuDF-backed dataframe + df = dd.read_parquet("data.parquet", ...) + +This code will use cuDF's GPU-accelerated :func:`parquet reader +` to read partitions of the data. + +Dataframe creation from in-memory formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you already have a dataframe in memory and want to convert it to a +cuDF-backend one, there are two options depending on whether the +dataframe is already a Dask one or not. If you have a Dask dataframe, +then you can call :func:`dask.dataframe.to_backend` passing ``"cudf"`` +as the backend; if you have a pandas dataframe then you can either +call :func:`dask.dataframe.from_pandas` followed by +:func:`~dask.dataframe.to_backend` or first convert the dataframe with +:func:`cudf.from_pandas` and then parallelise this with +:func:`dask_cudf.from_cudf`. + +API Reference +------------- + +Generally speaking, Dask-cuDF tries to offer exactly the same API as +Dask itself. 
There are, however, some minor differences mostly because +cuDF does not :doc:`perfectly mirror ` +the pandas API, or because cuDF provides additional configuration +flags (these mostly occur in data reading and writing interfaces). + +As a result, straightforward workflows can be migrated without too +much trouble, but more complex ones that utilise more features may +need a bit of tweaking. The API documentation describes details of the +differences and all functionality that Dask-cuDF supports. + +.. toctree:: + :maxdepth: 2 + + api + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 60bbe5d9571..d2858876fcd 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,6 +1,7 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import math +import textwrap import warnings import numpy as np @@ -68,6 +69,18 @@ def to_dask_dataframe(self, **kwargs): class DataFrame(_Frame, dd.core.DataFrame): + """ + A distributed Dask DataFrame where the backing dataframe is a + :class:`cuDF DataFrame `. + + Typically you would not construct this object directly, but rather + use one of Dask-cuDF's IO routines. + + Most operations on :doc:`Dask DataFrames ` are + supported, with many of the same caveats. + + """ + _partition_type = cudf.DataFrame @_dask_cudf_nvtx_annotate @@ -671,12 +684,35 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): from_cudf.__doc__ = ( - "Wraps main-line Dask from_pandas...\n" + dd.from_pandas.__doc__ + textwrap.dedent( + """ + Create a :class:`.DataFrame` from a :class:`cudf.DataFrame`. 
+ + This function is a thin wrapper around + :func:`dask.dataframe.from_pandas`, accepting the same + arguments (described below) excepting that it operates on cuDF + rather than pandas objects.\n + """ + ) + + textwrap.dedent(dd.from_pandas.__doc__) ) @_dask_cudf_nvtx_annotate def from_dask_dataframe(df): + """ + Convert a Dask :class:`dask.dataframe.DataFrame` to a Dask-cuDF + one. + + Parameters + ---------- + df : dask.dataframe.DataFrame + The Dask dataframe to convert + + Returns + ------- + dask_cudf.DataFrame : A new Dask collection backed by cuDF objects + """ return df.map_partitions(cudf.from_pandas) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index f91738bdab0..f4bbcaf4dd1 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from functools import wraps from typing import Set @@ -433,22 +433,55 @@ def groupby_agg( ): """Optimized groupby aggregation for Dask-CuDF. - This aggregation algorithm only supports the following options: - - - "count" - - "mean" - - "std" - - "var" - - "sum" - - "min" - - "max" - - "collect" - - "first" - - "last" - - This "optimized" approach is more performant than the algorithm - in `dask.dataframe`, because it allows the cudf backend to - perform multiple aggregations at once. + Parameters + ---------- + ddf : DataFrame + DataFrame object to perform grouping on. + gb_cols : str or list[str] + Column names to group by. + aggs_in : str, list, or dict + Aggregations to perform. + split_every : int (optional) + How to group intermediate aggregates. + dropna : bool + Drop grouping key values corresponding to NA values. + as_index : bool + Currently ignored. + sort : bool + Sort the group keys, better performance is obtained when + not sorting. + shuffle : str (optional) + Control how shuffling of the DataFrame is performed. 
+ sep : str + Internal usage. + + + Notes + ----- + This "optimized" approach is more performant than the algorithm in + implemented in :meth:`DataFrame.apply` because it allows the cuDF + backend to perform multiple aggregations at once. + + This aggregation algorithm only supports the following options + + * "collect" + * "count" + * "first" + * "last" + * "max" + * "mean" + * "min" + * "std" + * "sum" + * "var" + + + See Also + -------- + DataFrame.groupby : generic groupby of a DataFrame + dask.dataframe.apply_concat_apply : for more description of the + split_every argument. + """ # Assert that aggregations are supported aggs = _redirect_aggs(aggs_in) diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index b4d080fd182..fd27083bbf4 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -16,9 +16,10 @@ def read_csv(path, blocksize="default", **kwargs): """ - Read CSV files into a dask_cudf.DataFrame + Read CSV files into a :class:`.DataFrame`. - This API parallelizes the ``cudf.read_csv`` function in the following ways: + This API parallelizes the :func:`cudf:cudf.read_csv` function in + the following ways: It supports loading many files at once using globstrings: @@ -34,23 +35,26 @@ def read_csv(path, blocksize="default", **kwargs): >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") - Internally ``dask_cudf.read_csv`` uses ``cudf.read_csv`` and supports - many of the same keyword arguments with the same performance guarantees. - See the docstring for ``cudf.read_csv()`` for more information on available + Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and + supports many of the same keyword arguments with the same + performance guarantees. See the docstring for + :func:`cudf:cudf.read_csv` for more information on available keyword arguments. 
Parameters ---------- path : str, path object, or file-like object - Either a path to a file (a str, pathlib.Path, or - py._path.local.LocalPath), URL (including http, ftp, and S3 locations), - or any object with a read() method (such as builtin open() file - handler function or StringIO). + Either a path to a file (a str, :py:class:`pathlib.Path`, or + py._path.local.LocalPath), URL (including http, ftp, and S3 + locations), or any object with a read() method (such as + builtin :py:func:`open` file handler function or + :py:class:`~io.StringIO`). blocksize : int or str, default "256 MiB" - The target task partition size. If `None`, a single block + The target task partition size. If ``None``, a single block is used for each file. **kwargs : dict - Passthrough key-word arguments that are sent to ``cudf.read_csv``. + Passthrough key-word arguments that are sent to + :func:`cudf:cudf.read_csv`. Examples -------- @@ -61,6 +65,7 @@ def read_csv(path, blocksize="default", **kwargs): 0 1 hi 1 2 hello 2 3 ai + """ # Handle `chunksize` deprecation diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index bb3d0f3c601..2a6ad603414 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -10,30 +10,33 @@ def read_json(url_path, engine="auto", **kwargs): - """Create a dask_cudf DataFrame collection from JSON data + """Read JSON data into a :class:`.DataFrame`. - This function wraps ``dask.dataframe.read_json``, and passes + This function wraps :func:`dask.dataframe.read_json`, and passes ``engine=partial(cudf.read_json, engine="auto")`` by default. Parameters ---------- - url_path: str, list of str + url_path : str, list of str Location to read from. If a string, can include a glob character to find a set of file names. Supports protocol specifications such as ``"s3://"``. 
engine : str or Callable, default "auto" - If str, this value will be used as the ``engine`` argument when - ``cudf.read_json`` is used to create each partition. If Callable, - this value will be used as the underlying function used to create - each partition from JSON data. The default value is "auto", so - that ``engine=partial(cudf.read_json, engine="auto")`` will be - passed to ``dask.dataframe.read_json`` by default. + + If str, this value will be used as the ``engine`` argument + when :func:`cudf.read_json` is used to create each partition. + If a :obj:`~typing.Callable`, this value will be used as the + underlying function used to create each partition from JSON + data. The default value is "auto", so that + ``engine=partial(cudf.read_json, engine="auto")`` will be + passed to :func:`dask.dataframe.read_json` by default. + **kwargs : - Key-word arguments to pass through to ``dask.dataframe.read_json``. + Key-word arguments to pass through to :func:`dask.dataframe.read_json`. Returns ------- - dask_cudf.DataFrame + :class:`.DataFrame` Examples -------- @@ -53,7 +56,8 @@ def read_json(url_path, engine="auto", **kwargs): See Also -------- - dask.dataframe.io.json.read_json + dask.dataframe.read_json + """ # TODO: Add optimized code path to leverage the diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index e731057ed90..49fea0d7602 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from io import BufferedWriter, IOBase @@ -25,37 +25,45 @@ def _read_orc_stripe(fs, path, stripe, columns, kwargs=None): def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): - """Read cudf dataframe from ORC file(s). + """Read ORC files into a :class:`.DataFrame`. Note that this function is mostly borrowed from upstream Dask. 
Parameters ---------- - path: str or list(str) + path : str or list[str] Location of file(s), which can be a full URL with protocol specifier, and may include glob character if a single string. - columns: None or list(str) + columns : None or list[str] Columns to load. If None, loads all. filters : None or list of tuple or list of lists of tuples - If not None, specifies a filter predicate used to filter out row groups - using statistics stored for each row group as Parquet metadata. Row - groups that do not match the given filter predicate are not read. The - predicate is expressed in disjunctive normal form (DNF) like - `[[('x', '=', 0), ...], ...]`. DNF allows arbitrary boolean logical - combinations of single column predicates. The innermost tuples each - describe a single column predicate. The list of inner predicates is - interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the outermost list combines - these filters as a disjunction (OR). Predicates may also be passed - as a list of tuples. This form is interpreted as a single conjunction. - To express OR in predicates, one must use the (preferred) notation of - list of lists of tuples. - storage_options: None or dict + If not None, specifies a filter predicate used to filter out + row groups using statistics stored for each row group as + Parquet metadata. Row groups that do not match the given + filter predicate are not read. The predicate is expressed in + `disjunctive normal form (DNF) + `__ + like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary + boolean logical combinations of single column predicates. The + innermost tuples each describe a single column predicate. The + list of inner predicates is interpreted as a conjunction + (AND), forming a more selective and multiple column predicate. + Finally, the outermost list combines these filters as a + disjunction (OR). Predicates may also be passed as a list of + tuples. 
This form is interpreted as a single conjunction. To + express OR in predicates, one must use the (preferred) + notation of list of lists of tuples. + storage_options : None or dict Further parameters to pass to the bytes backend. + See Also + -------- + dask.dataframe.read_orc + Returns ------- - cudf.DataFrame + dask_cudf.DataFrame + """ storage_options = storage_options or {} @@ -133,22 +141,25 @@ def to_orc( compute=True, **kwargs, ): - """Write a dask_cudf dataframe to ORC file(s) (one file per partition). + """ + Write a :class:`.DataFrame` to ORC file(s) (one file per partition). Parameters ---------- - df : dask_cudf.DataFrame - path: string or pathlib.Path + df : DataFrame + path : str or pathlib.Path Destination directory for data. Prepend with protocol like ``s3://`` or ``hdfs://`` for remote data. write_index : boolean, optional Whether or not to write the index. Defaults to True. - storage_options: None or dict + storage_options : None or dict Further parameters to pass to the bytes backend. compression : string or dict, optional compute : bool, optional - If True (default) then the result is computed immediately. If False - then a ``dask.delayed`` object is returned for future computation. + If True (default) then the result is computed immediately. If + False then a :class:`~dask.delayed.Delayed` object is returned + for future computation. + """ from dask import compute as dask_compute, delayed diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 452f2f8914a..b03ac256b05 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -438,13 +438,14 @@ def set_object_dtypes_from_pa_schema(df, schema): def read_parquet(path, columns=None, **kwargs): - """Read parquet files into a Dask DataFrame + """ + Read parquet files into a :class:`.DataFrame`. 
- Calls ``dask.dataframe.read_parquet`` with ``engine=CudfEngine`` - to coordinate the execution of ``cudf.read_parquet``, and to - ultimately create a ``dask_cudf.DataFrame`` collection. + Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` + to coordinate the execution of :func:`cudf.read_parquet`, and to + ultimately create a :class:`.DataFrame` collection. - See the ``dask.dataframe.read_parquet`` documentation for + See the :func:`dask.dataframe.read_parquet` documentation for all available options. Examples @@ -469,6 +470,7 @@ def read_parquet(path, columns=None, **kwargs): See Also -------- cudf.read_parquet + dask.dataframe.read_parquet """ if isinstance(columns, str): columns = [columns] From 84ac1d5a87859bd96bd767b5a65b017e260024a1 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 21 Mar 2023 12:52:54 +0100 Subject: [PATCH 31/63] Fix conda recipe post-link.sh typo (#12916) Following an internal discussion, fix a typo in conda recipe `post-link.sh`. For some conda versions relying solely on `prelink_message` is insufficient, thus this file has to still be maintained. Authors: - Peter Andreas Entschev (https://github.com/pentschev) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Jordan Jacobelli (https://github.com/jjacobelli) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/12916 --- conda/recipes/libcudf/post-link.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/recipes/libcudf/post-link.sh b/conda/recipes/libcudf/post-link.sh index 64e0b1ad305..8ae2349f791 100644 --- a/conda/recipes/libcudf/post-link.sh +++ b/conda/recipes/libcudf/post-link.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
# Only add the license notice to libcudf and not our examples / tests if [[ "$PKG_NAME" == "libcudf" ]]; then - cat ./nvlink.txt >> $PREFIX/.messages.txt + cat ./nvcomp.txt >> $PREFIX/.messages.txt fi From 6547d962acde9c7f4091d89ac44da04b7ab5c409 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 21 Mar 2023 14:38:30 -0400 Subject: [PATCH 32/63] Remove default detail mrs: part5 (#12968) This is the fifth PR in a sequence removing default mr parameters in detail APIs. Contributes to https://github.com/rapidsai/cudf/issues/12944. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/12968 --- cpp/include/cudf/detail/reshape.hpp | 18 +++--- cpp/include/cudf/io/detail/csv.hpp | 4 +- cpp/include/cudf/io/detail/tokenize_json.hpp | 2 +- cpp/include/cudf/lists/detail/combine.hpp | 20 +++---- .../cudf/lists/detail/set_operations.hpp | 54 +++++++++--------- cpp/src/copying/get_element.cu | 56 +++++++++---------- cpp/src/filling/fill.cu | 11 ++-- cpp/src/filling/sequence.cu | 11 ++-- cpp/src/io/csv/durations.hpp | 9 ++- cpp/src/io/json/json_column.cu | 3 +- cpp/src/lists/set_operations.cu | 7 ++- cpp/tests/io/json_tree.cpp | 20 +++---- cpp/tests/io/nested_json_test.cpp | 8 +-- 13 files changed, 105 insertions(+), 118 deletions(-) diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp index ccffcbc61df..5ab53690a23 100644 --- a/cpp/include/cudf/detail/reshape.hpp +++ b/cpp/include/cudf/detail/reshape.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,21 +30,19 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches */ -std::unique_ptr tile( - table_view const& input, - size_type count, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
tile(table_view const& input, + size_type count, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::interleave_columns * * @param stream CUDA stream used for device memory operations and kernel launches */ -std::unique_ptr interleave_columns( - table_view const& input, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr interleave_columns(table_view const& input, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 90d730338fc..9fdc7a47fb9 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -56,7 +56,7 @@ void write_csv(data_sink* sink, host_span column_names, csv_writer_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr); } // namespace csv } // namespace detail diff --git a/cpp/include/cudf/io/detail/tokenize_json.hpp b/cpp/include/cudf/io/detail/tokenize_json.hpp index b03dbd4fb70..4914f434c98 100644 --- a/cpp/include/cudf/io/detail/tokenize_json.hpp +++ b/cpp/include/cudf/io/detail/tokenize_json.hpp @@ -131,7 +131,7 @@ std::pair, rmm::device_uvector> ge device_span json_in, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr); } // namespace detail diff --git a/cpp/include/cudf/lists/detail/combine.hpp b/cpp/include/cudf/lists/detail/combine.hpp index 9f28074173a..4bc45e48a9f 100644 --- a/cpp/include/cudf/lists/detail/combine.hpp +++ b/cpp/include/cudf/lists/detail/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,22 +27,20 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr concatenate_rows( - table_view const& input, - concatenate_null_policy null_policy, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate_rows(table_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::lists::concatenate_list_elements * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr concatenate_list_elements( - column_view const& input, - concatenate_null_policy null_policy, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate_list_elements(column_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/set_operations.hpp b/cpp/include/cudf/lists/detail/set_operations.hpp index ef4255de430..1411c65448e 100644 --- a/cpp/include/cudf/lists/detail/set_operations.hpp +++ b/cpp/include/cudf/lists/detail/set_operations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,52 +30,48 @@ namespace cudf::lists::detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr have_overlap( - lists_column_view const& lhs, - lists_column_view const& rhs, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr have_overlap(lists_column_view const& lhs, + lists_column_view const& rhs, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::list::intersect_distinct * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr intersect_distinct( - lists_column_view const& lhs, - lists_column_view const& rhs, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr intersect_distinct(lists_column_view const& lhs, + lists_column_view const& rhs, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::list::union_distinct * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr union_distinct( - lists_column_view const& lhs, - lists_column_view const& rhs, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr union_distinct(lists_column_view const& lhs, + lists_column_view const& rhs, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::list::difference_distinct * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr difference_distinct( - lists_column_view const& lhs, - lists_column_view const& rhs, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr difference_distinct(lists_column_view const& lhs, + lists_column_view const& rhs, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** @} */ // end of group } // namespace cudf::lists::detail diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index 5e76b4adbbe..cc12aaa1382 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,11 +37,10 @@ namespace { struct get_element_functor { template () && !is_fixed_point()>* p = nullptr> - std::unique_ptr operator()( - column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(column_view const& input, + size_type index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto s = make_fixed_width_scalar(data_type(type_to_id()), stream, mr); @@ -61,11 +60,10 @@ struct get_element_functor { } template >* p = nullptr> - std::unique_ptr operator()( - column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(column_view const& input, + size_type index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto device_col = column_device_view::create(input, stream); @@ -86,11 +84,10 @@ struct get_element_functor { } template >* p = nullptr> - std::unique_ptr operator()( - column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(column_view const& input, + size_type index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto dict_view = dictionary_column_view(input); auto indices_iter = detail::indexalator_factory::make_input_iterator(dict_view.indices()); @@ -122,11 +119,10 @@ struct get_element_functor { } template >* p = nullptr> - std::unique_ptr operator()( - column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(column_view const& input, + size_type index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { bool valid = is_element_valid_sync(input, 
index, stream); auto const child_col_idx = lists_column_view::child_column_index; @@ -147,11 +143,10 @@ struct get_element_functor { } template ()>* p = nullptr> - std::unique_ptr operator()( - column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(column_view const& input, + size_type index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using Type = typename T::rep; @@ -178,11 +173,10 @@ struct get_element_functor { } template >* p = nullptr> - std::unique_ptr operator()( - column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(column_view const& input, + size_type index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { bool valid = is_element_valid_sync(input, index, stream); auto row_contents = diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index ecd66f1b0c9..a747cc195ae 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -104,11 +104,10 @@ struct out_of_place_fill_range_dispatch { template () or cudf::is_fixed_point())> - std::unique_ptr operator()( - cudf::size_type begin, - cudf::size_type end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(cudf::size_type begin, + cudf::size_type end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch."); auto p_ret = std::make_unique(input, stream, mr); diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu index 284e7c46347..b4bab369c61 100644 --- a/cpp/src/filling/sequence.cu +++ b/cpp/src/filling/sequence.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -134,11 +134,10 @@ std::unique_ptr sequence(size_type size, return type_dispatcher(init.type(), sequence_functor{}, size, init, step, stream, mr); } -std::unique_ptr sequence( - size_type size, - scalar const& init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr sequence(size_type size, + scalar const& init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(size >= 0, "size must be >= 0"); CUDF_EXPECTS(is_numeric(init.type()), "init scalar type must be numeric"); diff --git a/cpp/src/io/csv/durations.hpp b/cpp/src/io/csv/durations.hpp index d42ddf3817c..ac925011c58 100644 --- a/cpp/src/io/csv/durations.hpp +++ b/cpp/src/io/csv/durations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,10 +28,9 @@ namespace io { namespace detail { namespace csv { -std::unique_ptr pandas_format_durations( - column_view const& durations, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr pandas_format_durations(column_view const& durations, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace csv } // namespace detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index f4d65f37cdb..c937315969c 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -893,7 +893,8 @@ table_with_metadata device_parse_nested_json(device_span d_input, auto gpu_tree = [&]() { // Parse the JSON and get the token stream - const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream); + const auto [tokens_gpu, token_indices_gpu] = + get_token_stream(d_input, options, stream, rmm::mr::get_current_device_resource()); // gpu tree generation return get_tree_representation( tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index 8df99153d74..eb3ec5a8236 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -195,8 +195,11 @@ std::unique_ptr union_distinct(lists_column_view const& lhs, // Algorithm: `return distinct(concatenate_rows(lhs, rhs))`. 
- auto const union_col = lists::detail::concatenate_rows( - table_view{{lhs.parent(), rhs.parent()}}, concatenate_null_policy::NULLIFY_OUTPUT_ROW, stream); + auto const union_col = + lists::detail::concatenate_rows(table_view{{lhs.parent(), rhs.parent()}}, + concatenate_null_policy::NULLIFY_OUTPUT_ROW, + stream, + rmm::mr::get_current_device_resource()); return cudf::lists::detail::distinct( lists_column_view{union_col->view()}, nulls_equal, nans_equal, stream, mr); diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index 94a7c8edcf9..0ae0360c4d9 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -586,8 +586,8 @@ TEST_F(JsonTest, TreeRepresentation) cudf::io::json_reader_options const options{}; // Parse the JSON and get the token stream - const auto [tokens_gpu, token_indices_gpu] = - cudf::io::json::detail::get_token_stream(d_input, options, stream); + const auto [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream( + d_input, options, stream, rmm::mr::get_current_device_resource()); // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( @@ -673,8 +673,8 @@ TEST_F(JsonTest, TreeRepresentation2) cudf::io::json_reader_options const options{}; // Parse the JSON and get the token stream - const auto [tokens_gpu, token_indices_gpu] = - cudf::io::json::detail::get_token_stream(d_input, options, stream); + const auto [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream( + d_input, options, stream, rmm::mr::get_current_device_resource()); // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( @@ -747,8 +747,8 @@ TEST_F(JsonTest, TreeRepresentation3) options.enable_lines(true); // Parse the JSON and get the token stream - const auto [tokens_gpu, token_indices_gpu] = - cudf::io::json::detail::get_token_stream(d_input, options, stream); + const auto [tokens_gpu, token_indices_gpu] = 
cudf::io::json::detail::get_token_stream( + d_input, options, stream, rmm::mr::get_current_device_resource()); // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( @@ -772,8 +772,8 @@ TEST_F(JsonTest, TreeRepresentationError) cudf::io::json_reader_options const options{}; // Parse the JSON and get the token stream - const auto [tokens_gpu, token_indices_gpu] = - cudf::io::json::detail::get_token_stream(d_input, options, stream); + const auto [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream( + d_input, options, stream, rmm::mr::get_current_device_resource()); // Get the JSON's tree representation // This JSON is invalid and will raise an exception. @@ -855,8 +855,8 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal) static_cast(d_scalar.size())}; // Parse the JSON and get the token stream - const auto [tokens_gpu, token_indices_gpu] = - cudf::io::json::detail::get_token_stream(d_input, options, stream); + const auto [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream( + d_input, options, stream, rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); bool const is_array_of_arrays = diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 8af530b0002..5b797a00ca1 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -262,8 +262,8 @@ TEST_F(JsonTest, TokenStream) cudf::device_span{d_scalar.data(), static_cast(d_scalar.size())}; // Parse the JSON and get the token stream - auto [d_tokens_gpu, d_token_indices_gpu] = - cuio_json::detail::get_token_stream(d_input, default_options, stream); + auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( + d_input, default_options, stream, rmm::mr::get_current_device_resource()); // Copy back the number of tokens that were written 
thrust::host_vector const tokens_gpu = cudf::detail::make_host_vector_async(d_tokens_gpu, stream); @@ -398,8 +398,8 @@ TEST_F(JsonTest, TokenStream2) cudf::device_span{d_scalar.data(), static_cast(d_scalar.size())}; // Parse the JSON and get the token stream - auto [d_tokens_gpu, d_token_indices_gpu] = - cuio_json::detail::get_token_stream(d_input, default_options, stream); + auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( + d_input, default_options, stream, rmm::mr::get_current_device_resource()); // Copy back the number of tokens that were written thrust::host_vector const tokens_gpu = cudf::detail::make_host_vector_async(d_tokens_gpu, stream); From 17a2cdcdec0a3bf24efa82d46496d95501bce7cf Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 21 Mar 2023 11:52:29 -0700 Subject: [PATCH 33/63] Refactor orc chunked writer (#12949) The current ORC chunked writer performs compressing/encoding and writing data into the output data sink without any safeguard. This PR modifies the internal `writer::impl::write()` function, separating it into multiple pieces: * A free function that performs compressing/encoding the input table into intermediate results. These intermediate results are totally independent of the writer. As such, the writer can be isolated from failures of this free function, allowing to retry upon failure. * After having the intermediate results in the previous step, these results will be actually applied to the output data sink to start the actual data writing. Some cleanup is also performed on the existing code. That includes moving some member functions into free functions, which helps reducing potential dependencies between translation units. There is no new implementation added in this work. Only the existing code is moved around. Partially contributes to https://github.com/rapidsai/cudf/issues/12792. 
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/12949 --- cpp/src/io/orc/writer_impl.cu | 1019 ++++++++++++++++++++------------ cpp/src/io/orc/writer_impl.hpp | 269 +++------ 2 files changed, 736 insertions(+), 552 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 1ee90bde1d2..d3bb0a45c12 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -332,6 +333,136 @@ size_type orc_table_view::num_rows() const noexcept return columns.empty() ? 0 : columns.front().size(); } +orc_streams::orc_stream_offsets orc_streams::compute_offsets( + host_span columns, size_t num_rowgroups) const +{ + std::vector strm_offsets(streams.size()); + size_t non_rle_data_size = 0; + size_t rle_data_size = 0; + for (size_t i = 0; i < streams.size(); ++i) { + const auto& stream = streams[i]; + + auto const is_rle_data = [&]() { + // First stream is an index stream, don't check types, etc. 
+ if (!stream.column_index().has_value()) return true; + + auto const& column = columns[stream.column_index().value()]; + // Dictionary encoded string column - dictionary characters or + // directly encoded string - column characters + if (column.orc_kind() == TypeKind::STRING && + ((stream.kind == DICTIONARY_DATA && column.orc_encoding() == DICTIONARY_V2) || + (stream.kind == DATA && column.orc_encoding() == DIRECT_V2))) + return false; + // Decimal data + if (column.orc_kind() == TypeKind::DECIMAL && stream.kind == DATA) return false; + + // Everything else uses RLE + return true; + }(); + // non-RLE and RLE streams are separated in the buffer that stores encoded data + // The computed offsets do not take the streams of the other type into account + if (is_rle_data) { + strm_offsets[i] = rle_data_size; + rle_data_size += (stream.length + 7) & ~7; + } else { + strm_offsets[i] = non_rle_data_size; + non_rle_data_size += stream.length; + } + } + non_rle_data_size = (non_rle_data_size + 7) & ~7; + + return {std::move(strm_offsets), non_rle_data_size, rle_data_size}; +} + +namespace { +struct string_length_functor { + __device__ inline size_type operator()(int const i) const + { + // we translate from 0 -> num_chunks * 2 because each statistic has a min and max + // string and we need to calculate lengths for both. + if (i >= num_chunks * 2) return 0; + + // min strings are even values, max strings are odd values of i + auto const should_copy_min = i % 2 == 0; + // index of the chunk + auto const idx = i / 2; + auto& str_val = should_copy_min ? stripe_stat_chunks[idx].min_value.str_val + : stripe_stat_chunks[idx].max_value.str_val; + auto const str = stripe_stat_merge[idx].stats_dtype == dtype_string; + return str ? 
str_val.length : 0; + } + + int const num_chunks; + statistics_chunk const* stripe_stat_chunks; + statistics_merge_group const* stripe_stat_merge; +}; + +__global__ void copy_string_data(char* string_pool, + size_type* offsets, + statistics_chunk* chunks, + statistics_merge_group const* groups) +{ + auto const idx = blockIdx.x / 2; + if (groups[idx].stats_dtype == dtype_string) { + // min strings are even values, max strings are odd values of i + auto const should_copy_min = blockIdx.x % 2 == 0; + auto& str_val = should_copy_min ? chunks[idx].min_value.str_val : chunks[idx].max_value.str_val; + auto dst = &string_pool[offsets[blockIdx.x]]; + auto src = str_val.ptr; + + for (int i = threadIdx.x; i < str_val.length; i += blockDim.x) { + dst[i] = src[i]; + } + if (threadIdx.x == 0) { str_val.ptr = dst; } + } +} + +} // namespace + +void persisted_statistics::persist(int num_table_rows, + bool single_write_mode, + intermediate_statistics& intermediate_stats, + rmm::cuda_stream_view stream) +{ + if (not single_write_mode) { + // persist the strings in the chunks into a string pool and update pointers + auto const num_chunks = static_cast(intermediate_stats.stripe_stat_chunks.size()); + // min offset and max offset + 1 for total size + rmm::device_uvector offsets((num_chunks * 2) + 1, stream); + + auto iter = cudf::detail::make_counting_transform_iterator( + 0, + string_length_functor{num_chunks, + intermediate_stats.stripe_stat_chunks.data(), + intermediate_stats.stripe_stat_merge.device_ptr()}); + thrust::exclusive_scan(rmm::exec_policy(stream), iter, iter + offsets.size(), offsets.begin()); + + // pull size back to host + auto const total_string_pool_size = offsets.element(num_chunks * 2, stream); + if (total_string_pool_size > 0) { + rmm::device_uvector string_pool(total_string_pool_size, stream); + + // offsets describes where in the string pool each string goes. 
Going with the simple + // approach for now, but it is possible something fancier with breaking up each thread into + // copying x bytes instead of a single string is the better method since we are dealing in + // min/max strings they almost certainly will not be uniform length. + copy_string_data<<>>( + string_pool.data(), + offsets.data(), + intermediate_stats.stripe_stat_chunks.data(), + intermediate_stats.stripe_stat_merge.device_ptr()); + string_pools.emplace_back(std::move(string_pool)); + } + } + + stripe_stat_chunks.emplace_back(std::move(intermediate_stats.stripe_stat_chunks)); + stripe_stat_merge.emplace_back(std::move(intermediate_stats.stripe_stat_merge)); + stats_dtypes = std::move(intermediate_stats.stats_dtypes); + col_types = std::move(intermediate_stats.col_types); + num_rows = num_table_rows; +} + +namespace { /** * @brief Gathers stripe information. * @@ -442,12 +573,26 @@ void init_dictionaries(orc_table_view& orc_table, dict->device_to_host(stream, true); } -void writer::impl::build_dictionaries(orc_table_view& orc_table, - host_span stripe_bounds, - hostdevice_2dvector const& dict, - host_span> dict_index, - host_span dictionary_enabled, - hostdevice_2dvector& stripe_dict) +/** + * @brief Builds up per-stripe dictionaries for string columns. 
+ * + * @param orc_table Non-owning view of a cuDF table w/ ORC-related info + * @param stripe_bounds List of stripe boundaries + * @param dict List of dictionary chunks [rowgroup][column] + * @param dict_index List of dictionary indices + * @param dictionary_enabled Whether dictionary encoding is enabled for a given column + * @param stripe_dict List of stripe dictionaries + * @param enable_dictionary Whether dictionary is enabled + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void build_dictionaries(orc_table_view& orc_table, + host_span stripe_bounds, + hostdevice_2dvector const& dict, + host_span> dict_index, + host_span dictionary_enabled, + hostdevice_2dvector& stripe_dict, + bool enable_dictionary, + rmm::cuda_stream_view stream) { const auto num_rowgroups = dict.size().first; @@ -471,7 +616,7 @@ void writer::impl::build_dictionaries(orc_table_view& orc_table, sd.leaf_column = dict[0][dict_idx].leaf_column; } - if (enable_dictionary_) { + if (enable_dictionary) { struct string_column_cost { size_t direct = 0; size_t dictionary = 0; @@ -555,9 +700,20 @@ auto comp_block_alignment(CompressionKind compression_kind) return 1u << nvcomp::compress_output_alignment_bits(to_nvcomp_compression_type(compression_kind)); } -orc_streams writer::impl::create_streams(host_span columns, - file_segmentation const& segmentation, - std::map const& decimal_column_sizes) +/** + * @brief Builds up per-column streams. 
+ * + * @param[in,out] columns List of columns + * @param[in] segmentation stripe and rowgroup ranges + * @param[in] decimal_column_sizes Sizes of encoded decimal columns + * @return List of stream descriptors + */ +orc_streams create_streams(host_span columns, + file_segmentation const& segmentation, + std::map const& decimal_column_sizes, + bool enable_dictionary, + CompressionKind compression_kind, + bool single_write_mode) { // 'column 0' row index stream std::vector streams{{ROW_INDEX, 0}}; // TODO: Separate index and data streams? @@ -600,7 +756,7 @@ orc_streams writer::impl::create_streams(host_span columns, auto add_stream = [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { - auto const max_alignment_padding = uncomp_block_alignment(compression_kind_) - 1; + auto const max_alignment_padding = uncomp_block_alignment(compression_kind) - 1; const auto base = column.index() * gpu::CI_NUM_STREAMS; ids[base + index_type] = streams.size(); streams.push_back(orc::Stream{ @@ -637,7 +793,7 @@ orc_streams writer::impl::create_streams(host_span columns, column.set_orc_encoding(DIRECT); break; case TypeKind::STRING: { - bool enable_dict = enable_dictionary_; + bool enable_dict = enable_dictionary; size_t dict_data_size = 0; size_t dict_strings = 0; size_t dict_lengths_div512 = 0; @@ -712,47 +868,6 @@ orc_streams writer::impl::create_streams(host_span columns, return {std::move(streams), std::move(ids), std::move(types)}; } -orc_streams::orc_stream_offsets orc_streams::compute_offsets( - host_span columns, size_t num_rowgroups) const -{ - std::vector strm_offsets(streams.size()); - size_t non_rle_data_size = 0; - size_t rle_data_size = 0; - for (size_t i = 0; i < streams.size(); ++i) { - const auto& stream = streams[i]; - - auto const is_rle_data = [&]() { - // First stream is an index stream, don't check types, etc. 
- if (!stream.column_index().has_value()) return true; - - auto const& column = columns[stream.column_index().value()]; - // Dictionary encoded string column - dictionary characters or - // directly encoded string - column characters - if (column.orc_kind() == TypeKind::STRING && - ((stream.kind == DICTIONARY_DATA && column.orc_encoding() == DICTIONARY_V2) || - (stream.kind == DATA && column.orc_encoding() == DIRECT_V2))) - return false; - // Decimal data - if (column.orc_kind() == TypeKind::DECIMAL && stream.kind == DATA) return false; - - // Everything else uses RLE - return true; - }(); - // non-RLE and RLE streams are separated in the buffer that stores encoded data - // The computed offsets do not take the streams of the other type into account - if (is_rle_data) { - strm_offsets[i] = rle_data_size; - rle_data_size += (stream.length + 7) & ~7; - } else { - strm_offsets[i] = non_rle_data_size; - non_rle_data_size += stream.length; - } - } - non_rle_data_size = (non_rle_data_size + 7) & ~7; - - return {std::move(strm_offsets), non_rle_data_size, rle_data_size}; -} - std::vector> calculate_aligned_rowgroup_bounds( orc_table_view const& orc_table, file_segmentation const& segmentation, @@ -1093,11 +1208,23 @@ encoded_data encode_columns(orc_table_view const& orc_table, return {std::move(encoded_data), std::move(chunk_streams)}; } -std::vector writer::impl::gather_stripes( +/** + * @brief Returns stripe information after compacting columns' individual data + * chunks into contiguous data streams. 
+ * + * @param[in] num_index_streams Total number of index streams + * @param[in] segmentation stripe and rowgroup ranges + * @param[in,out] enc_streams List of encoder chunk streams [column][rowgroup] + * @param[in,out] strm_desc List of stream descriptors [stripe][data_stream] + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @return The stripes' information + */ +std::vector gather_stripes( size_t num_index_streams, file_segmentation const& segmentation, hostdevice_2dvector* enc_streams, - hostdevice_2dvector* strm_desc) + hostdevice_2dvector* strm_desc, + rmm::cuda_stream_view stream) { if (segmentation.num_stripes() == 0) { return {}; } std::vector stripes(segmentation.num_stripes()); @@ -1165,16 +1292,25 @@ hostdevice_vector allocate_and_encode_blobs( return blobs; } -writer::impl::intermediate_statistics writer::impl::gather_statistic_blobs( - statistics_freq const stats_freq, - orc_table_view const& orc_table, - file_segmentation const& segmentation) +/** + * @brief Returns column statistics in an intermediate format. 
+ * + * @param statistics_freq Frequency of statistics to be included in the output file + * @param orc_table Table information to be written + * @param segmentation stripe and rowgroup ranges + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The statistic information + */ +intermediate_statistics gather_statistic_blobs(statistics_freq const stats_freq, + orc_table_view const& orc_table, + file_segmentation const& segmentation, + rmm::cuda_stream_view stream) { auto const num_rowgroup_blobs = segmentation.rowgroups.count(); auto const num_stripe_blobs = segmentation.num_stripes() * orc_table.num_columns(); auto const are_statistics_enabled = stats_freq != statistics_freq::STATISTICS_NONE; if (not are_statistics_enabled or num_rowgroup_blobs + num_stripe_blobs == 0) { - return writer::impl::intermediate_statistics{stream}; + return intermediate_statistics{stream}; } hostdevice_vector stat_desc(orc_table.num_columns(), stream); @@ -1292,8 +1428,17 @@ writer::impl::intermediate_statistics writer::impl::gather_statistic_blobs( std::move(col_types)}; } -writer::impl::encoded_footer_statistics writer::impl::finish_statistic_blobs( - int num_stripes, writer::impl::persisted_statistics& per_chunk_stats) +/** + * @brief Returns column statistics encoded in ORC protobuf format stored in the footer. 
+ * + * @param num_stripes number of stripes in the data + * @param incoming_stats intermediate statistics returned from `gather_statistic_blobs` + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The encoded statistic blobs + */ +encoded_footer_statistics finish_statistic_blobs(int num_stripes, + persisted_statistics& per_chunk_stats, + rmm::cuda_stream_view stream) { auto stripe_size_iter = thrust::make_transform_iterator(per_chunk_stats.stripe_stat_merge.begin(), [](auto const& i) { return i.size(); }); @@ -1383,16 +1528,36 @@ writer::impl::encoded_footer_statistics writer::impl::finish_statistic_blobs( return {std::move(stripe_blobs), std::move(file_blobs)}; } -void writer::impl::write_index_stream(int32_t stripe_id, - int32_t stream_id, - host_span columns, - file_segmentation const& segmentation, - host_2dspan enc_streams, - host_2dspan strm_desc, - host_span comp_res, - std::vector const& rg_stats, - StripeInformation* stripe, - orc_streams* streams) +/** + * @brief Writes the specified column's row index stream. 
+ * + * @param[in] stripe_id Stripe's identifier + * @param[in] stream_id Stream identifier (column id + 1) + * @param[in] columns List of columns + * @param[in] segmentation stripe and rowgroup ranges + * @param[in] enc_streams List of encoder chunk streams [column][rowgroup] + * @param[in] strm_desc List of stream descriptors + * @param[in] comp_res Output status for compressed streams + * @param[in] rg_stats row group level statistics + * @param[in,out] stripe Stream's parent stripe + * @param[in,out] streams List of all streams + * @param[in] compression_kind The compression kind + * @param[in] compression_blocksize The block size used for compression + * @param[in] out_sink Sink for writing data + */ +void write_index_stream(int32_t stripe_id, + int32_t stream_id, + host_span columns, + file_segmentation const& segmentation, + host_2dspan enc_streams, + host_2dspan strm_desc, + host_span comp_res, + std::vector const& rg_stats, + StripeInformation* stripe, + orc_streams* streams, + CompressionKind compression_kind, + size_t compression_blocksize, + std::unique_ptr const& out_sink) { row_group_index_info present; row_group_index_info data; @@ -1404,7 +1569,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, row_group_index_info record; if (stream.ids[type] > 0) { record.pos = 0; - if (compression_kind_ != NONE) { + if (compression_kind != NONE) { auto const& ss = strm_desc[stripe_id][stream.ids[type] - (columns.size() + 1)]; record.blk_pos = ss.first_block; record.comp_pos = 0; @@ -1419,10 +1584,10 @@ void writer::impl::write_index_stream(int32_t stripe_id, if (record.pos >= 0) { record.pos += stream.lengths[type]; while ((record.pos >= 0) && (record.blk_pos >= 0) && - (static_cast(record.pos) >= compression_blocksize_) && + (static_cast(record.pos) >= compression_blocksize) && (record.comp_pos + block_header_size + comp_res[record.blk_pos].bytes_written < static_cast(record.comp_size))) { - record.pos -= compression_blocksize_; + record.pos -= 
compression_blocksize; record.comp_pos += block_header_size + comp_res[record.blk_pos].bytes_written; record.blk_pos += 1; } @@ -1444,7 +1609,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, } } - ProtobufWriter pbw((compression_kind_ != NONE) ? 3 : 0); + ProtobufWriter pbw((compression_kind != NONE) ? 3 : 0); // Add row index entries auto const& rowgroups_range = segmentation.stripes[stripe_id]; @@ -1469,22 +1634,39 @@ void writer::impl::write_index_stream(int32_t stripe_id, }); (*streams)[stream_id].length = pbw.size(); - if (compression_kind_ != NONE) { + if (compression_kind != NONE) { uint32_t uncomp_ix_len = (uint32_t)((*streams)[stream_id].length - 3) * 2 + 1; pbw.buffer()[0] = static_cast(uncomp_ix_len >> 0); pbw.buffer()[1] = static_cast(uncomp_ix_len >> 8); pbw.buffer()[2] = static_cast(uncomp_ix_len >> 16); } - out_sink_->host_write(pbw.data(), pbw.size()); + out_sink->host_write(pbw.data(), pbw.size()); stripe->indexLength += pbw.size(); } -std::future writer::impl::write_data_stream(gpu::StripeStream const& strm_desc, - gpu::encoder_chunk_streams const& enc_stream, - uint8_t const* compressed_data, - uint8_t* stream_out, - StripeInformation* stripe, - orc_streams* streams) +/** + * @brief Write the specified column's data streams + * + * @param[in] strm_desc Stream's descriptor + * @param[in] enc_stream Chunk's streams + * @param[in] compressed_data Compressed stream data + * @param[in,out] stream_out Temporary host output buffer + * @param[in,out] stripe Stream's parent stripe + * @param[in,out] streams List of all streams + * @param[in] compression_kind The compression kind + * @param[in] out_sink Sink for writing data + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @return An std::future that should be synchronized to ensure the writing is complete + */ +std::future write_data_stream(gpu::StripeStream const& strm_desc, + gpu::encoder_chunk_streams const& enc_stream, + uint8_t const* 
compressed_data, + uint8_t* stream_out, + StripeInformation* stripe, + orc_streams* streams, + CompressionKind compression_kind, + std::unique_ptr const& out_sink, + rmm::cuda_stream_view stream) { const auto length = strm_desc.stream_size; (*streams)[enc_stream.ids[strm_desc.stream_type]].length = length; @@ -1492,18 +1674,18 @@ std::future writer::impl::write_data_stream(gpu::StripeStream const& strm_ return std::async(std::launch::deferred, [] {}); } - const auto* stream_in = (compression_kind_ == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type] - : (compressed_data + strm_desc.bfr_offset); + const auto* stream_in = (compression_kind == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type] + : (compressed_data + strm_desc.bfr_offset); auto write_task = [&]() { - if (out_sink_->is_device_write_preferred(length)) { - return out_sink_->device_write_async(stream_in, length, stream); + if (out_sink->is_device_write_preferred(length)) { + return out_sink->device_write_async(stream_in, length, stream); } else { CUDF_CUDA_TRY( cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDefault, stream.value())); stream.synchronize(); - out_sink_->host_write(stream_out, length); + out_sink->host_write(stream_out, length); return std::async(std::launch::deferred, [] {}); } }(); @@ -1511,18 +1693,27 @@ std::future writer::impl::write_data_stream(gpu::StripeStream const& strm_ return write_task; } -void writer::impl::add_uncompressed_block_headers(std::vector& v) +/** + * @brief Insert 3-byte uncompressed block headers in a byte vector + * + * @param compression_kind The compression kind + * @param compression_blocksize The block size used for compression + * @param v The destitation byte vector to write, which must include initial 3-byte header + */ +void add_uncompressed_block_headers(CompressionKind compression_kind, + size_t compression_blocksize, + std::vector& v) { - if (compression_kind_ != NONE) { + if (compression_kind != NONE) { size_t uncomp_len = v.size() - 3, pos 
= 0, block_len; - while (uncomp_len > compression_blocksize_) { - block_len = compression_blocksize_ * 2 + 1; + while (uncomp_len > compression_blocksize) { + block_len = compression_blocksize * 2 + 1; v[pos + 0] = static_cast(block_len >> 0); v[pos + 1] = static_cast(block_len >> 8); v[pos + 2] = static_cast(block_len >> 16); - pos += 3 + compression_blocksize_; + pos += 3 + compression_blocksize; v.insert(v.begin() + pos, 3, 0); - uncomp_len -= compression_blocksize_; + uncomp_len -= compression_blocksize; } block_len = uncomp_len * 2 + 1; v[pos + 0] = static_cast(block_len >> 0); @@ -1531,58 +1722,6 @@ void writer::impl::add_uncompressed_block_headers(std::vector& v) } } -writer::impl::impl(std::unique_ptr sink, - orc_writer_options const& options, - SingleWriteMode mode, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _mr(mr), - stream(stream), - max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, - row_index_stride{options.get_row_index_stride()}, - compression_kind_(to_orc_compression(options.get_compression())), - compression_blocksize_(compression_block_size(compression_kind_)), - stats_freq_(options.get_statistics_freq()), - single_write_mode(mode == SingleWriteMode::YES), - kv_meta(options.get_key_value_metadata()), - out_sink_(std::move(sink)) -{ - if (options.get_metadata()) { - table_meta = std::make_unique(*options.get_metadata()); - } - init_state(); -} - -writer::impl::impl(std::unique_ptr sink, - chunked_orc_writer_options const& options, - SingleWriteMode mode, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _mr(mr), - stream(stream), - max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, - row_index_stride{options.get_row_index_stride()}, - compression_kind_(to_orc_compression(options.get_compression())), - compression_blocksize_(compression_block_size(compression_kind_)), - stats_freq_(options.get_statistics_freq()), - 
single_write_mode(mode == SingleWriteMode::YES), - kv_meta(options.get_key_value_metadata()), - out_sink_(std::move(sink)) -{ - if (options.get_metadata()) { - table_meta = std::make_unique(*options.get_metadata()); - } - init_state(); -} - -writer::impl::~impl() { close(); } - -void writer::impl::init_state() -{ - // Write file header - out_sink_->host_write(MAGIC, std::strlen(MAGIC)); -} - void pushdown_lists_null_mask(orc_column_view const& col, device_span d_columns, bitmask_type const* parent_pd_mask, @@ -2018,48 +2157,6 @@ string_dictionaries allocate_dictionaries(orc_table_view const& orc_table, std::move(is_dict_enabled)}; } -struct string_length_functor { - __device__ inline size_type operator()(int const i) const - { - // we translate from 0 -> num_chunks * 2 because each statistic has a min and max - // string and we need to calculate lengths for both. - if (i >= num_chunks * 2) return 0; - - // min strings are even values, max strings are odd values of i - auto const should_copy_min = i % 2 == 0; - // index of the chunk - auto const idx = i / 2; - auto& str_val = should_copy_min ? stripe_stat_chunks[idx].min_value.str_val - : stripe_stat_chunks[idx].max_value.str_val; - auto const str = stripe_stat_merge[idx].stats_dtype == dtype_string; - return str ? str_val.length : 0; - } - - int const num_chunks; - statistics_chunk const* stripe_stat_chunks; - statistics_merge_group const* stripe_stat_merge; -}; - -__global__ void copy_string_data(char* string_pool, - size_type* offsets, - statistics_chunk* chunks, - statistics_merge_group const* groups) -{ - auto const idx = blockIdx.x / 2; - if (groups[idx].stats_dtype == dtype_string) { - // min strings are even values, max strings are odd values of i - auto const should_copy_min = blockIdx.x % 2 == 0; - auto& str_val = should_copy_min ? 
chunks[idx].min_value.str_val : chunks[idx].max_value.str_val; - auto dst = &string_pool[offsets[blockIdx.x]]; - auto src = str_val.ptr; - - for (int i = threadIdx.x; i < str_val.length; i += blockDim.x) { - dst[i] = src[i]; - } - if (threadIdx.x == 0) { str_val.ptr = dst; } - } -} - size_t max_compression_output_size(CompressionKind compression_kind, uint32_t compression_blocksize) { if (compression_kind == NONE) return 0; @@ -2068,60 +2165,14 @@ size_t max_compression_output_size(CompressionKind compression_kind, uint32_t co compression_blocksize); } -void writer::impl::persisted_statistics::persist(int num_table_rows, - bool single_write_mode, - intermediate_statistics& intermediate_stats, - rmm::cuda_stream_view stream) -{ - if (not single_write_mode) { - // persist the strings in the chunks into a string pool and update pointers - auto const num_chunks = static_cast(intermediate_stats.stripe_stat_chunks.size()); - // min offset and max offset + 1 for total size - rmm::device_uvector offsets((num_chunks * 2) + 1, stream); - - auto iter = cudf::detail::make_counting_transform_iterator( - 0, - string_length_functor{num_chunks, - intermediate_stats.stripe_stat_chunks.data(), - intermediate_stats.stripe_stat_merge.device_ptr()}); - thrust::exclusive_scan(rmm::exec_policy(stream), iter, iter + offsets.size(), offsets.begin()); - - // pull size back to host - auto const total_string_pool_size = offsets.element(num_chunks * 2, stream); - if (total_string_pool_size > 0) { - rmm::device_uvector string_pool(total_string_pool_size, stream); - - // offsets describes where in the string pool each string goes. Going with the simple - // approach for now, but it is possible something fancier with breaking up each thread into - // copying x bytes instead of a single string is the better method since we are dealing in - // min/max strings they almost certainly will not be uniform length. 
- copy_string_data<<>>( - string_pool.data(), - offsets.data(), - intermediate_stats.stripe_stat_chunks.data(), - intermediate_stats.stripe_stat_merge.device_ptr()); - string_pools.emplace_back(std::move(string_pool)); - } - } - - stripe_stat_chunks.emplace_back(std::move(intermediate_stats.stripe_stat_chunks)); - stripe_stat_merge.emplace_back(std::move(intermediate_stats.stripe_stat_merge)); - stats_dtypes = std::move(intermediate_stats.stats_dtypes); - col_types = std::move(intermediate_stats.col_types); - num_rows = num_table_rows; -} - -void writer::impl::write(table_view const& table) +std::unique_ptr make_table_meta(table_view const& input) { - CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); - auto const num_rows = table.num_rows(); - - if (not table_meta) { table_meta = std::make_unique(table); } + auto table_meta = std::make_unique(input); // Fill unnamed columns' names in table_meta std::function add_default_name = [&](column_in_metadata& col_meta, std::string default_name) { - if (col_meta.get_name().empty()) col_meta.set_name(default_name); + if (col_meta.get_name().empty()) { col_meta.set_name(default_name); } for (size_type i = 0; i < col_meta.num_children(); ++i) { add_default_name(col_meta.child(i), std::to_string(i)); } @@ -2130,9 +2181,51 @@ void writer::impl::write(table_view const& table) add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); } - auto const d_table = table_device_view::create(table, stream); + return table_meta; +} + +/** + * @brief Perform the processing steps needed to convert the input table into the output ORC data + * for writing, such as compression and ORC encoding. 
+ * + * @param input The input table + * @param table_meta The table metadata + * @param max_stripe_size Maximum size of stripes in the output file + * @param row_index_stride The row index stride + * @param enable_dictionary Whether dictionary is enabled + * @param compression_kind The compression kind + * @param compression_blocksize The block size used for compression + * @param stats_freq Column statistics granularity type for parquet/orc writers + * @param single_write_mode Flag to indicate if there is only a single table write + * @param out_sink Sink for writing data + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of the intermediate results containing the processed data + */ +std::tuple, + hostdevice_2dvector, + encoded_data, + file_segmentation, + std::vector, + orc_table_view, + rmm::device_buffer, + intermediate_statistics, + pinned_buffer> +convert_table_to_orc_data(table_view const& input, + table_input_metadata const& table_meta, + stripe_size_limits max_stripe_size, + size_type row_index_stride, + bool enable_dictionary, + CompressionKind compression_kind, + size_t compression_blocksize, + statistics_freq stats_freq, + bool single_write_mode, + data_sink const& out_sink, + rmm::cuda_stream_view stream) +{ + auto const input_tview = table_device_view::create(input, stream); - auto orc_table = make_orc_table_view(table, *d_table, *table_meta, stream); + auto orc_table = make_orc_table_view(input, *input_tview, table_meta, stream); auto const pd_masks = init_pushdown_null_masks(orc_table, stream); @@ -2152,7 +2245,7 @@ void writer::impl::write(table_view const& table) } // Decide stripe boundaries based on rowgroups and dict chunks - auto const segmentation = + auto segmentation = calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size); // Build stripe-level dictionaries @@ -2164,15 +2257,22 @@ void writer::impl::write(table_view const& table) dict, 
dictionaries.index, dictionaries.dictionary_enabled, - stripe_dict); + stripe_dict, + enable_dictionary, + stream); } auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); - auto const uncompressed_block_align = uncomp_block_alignment(compression_kind_); - auto const compressed_block_align = comp_block_alignment(compression_kind_); - auto streams = - create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes)); + auto const uncompressed_block_align = uncomp_block_alignment(compression_kind); + auto const compressed_block_align = comp_block_alignment(compression_kind); + + auto streams = create_streams(orc_table.columns, + segmentation, + decimal_column_sizes(dec_chunk_sizes.rg_sizes), + enable_dictionary, + compression_kind, + single_write_mode); auto enc_data = encode_columns(orc_table, std::move(dictionaries), std::move(dec_chunk_sizes), @@ -2181,152 +2281,314 @@ void writer::impl::write(table_view const& table) uncompressed_block_align, stream); + auto const num_rows = input.num_rows(); + // Assemble individual disparate column chunks into contiguous data streams size_type const num_index_streams = (orc_table.num_columns() + 1); const auto num_data_streams = streams.size() - num_index_streams; hostdevice_2dvector strm_descs( segmentation.num_stripes(), num_data_streams, stream); - auto stripes = gather_stripes(num_index_streams, segmentation, &enc_data.streams, &strm_descs); - - if (num_rows > 0) { - // Allocate intermediate output stream buffer - size_t compressed_bfr_size = 0; - size_t num_compressed_blocks = 0; - - auto const max_compressed_block_size = - max_compression_output_size(compression_kind_, compression_blocksize_); - auto const padded_max_compressed_block_size = - util::round_up_unsafe(max_compressed_block_size, compressed_block_align); - auto const padded_block_header_size = - util::round_up_unsafe(block_header_size, compressed_block_align); - - auto stream_output = [&]() { - size_t 
max_stream_size = 0; - bool all_device_write = true; - - for (auto& ss : strm_descs.host_view().flat_view()) { - if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; } - size_t stream_size = ss.stream_size; - if (compression_kind_ != NONE) { - ss.first_block = num_compressed_blocks; - ss.bfr_offset = compressed_bfr_size; - - auto num_blocks = std::max( - (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); - stream_size += num_blocks * block_header_size; - num_compressed_blocks += num_blocks; - compressed_bfr_size += - (padded_block_header_size + padded_max_compressed_block_size) * num_blocks; - } - max_stream_size = std::max(max_stream_size, stream_size); - } + auto stripes = + gather_stripes(num_index_streams, segmentation, &enc_data.streams, &strm_descs, stream); + + if (num_rows == 0) { + return {std::move(streams), + hostdevice_vector{}, // comp_results + std::move(strm_descs), + std::move(enc_data), + std::move(segmentation), + std::move(stripes), + std::move(orc_table), + rmm::device_buffer{}, // compressed_data + intermediate_statistics{stream}, + pinned_buffer{nullptr, cudaFreeHost}}; + } - if (all_device_write) { - return pinned_buffer{nullptr, cudaFreeHost}; - } else { - return pinned_buffer{[](size_t size) { - uint8_t* ptr = nullptr; - CUDF_CUDA_TRY(cudaMallocHost(&ptr, size)); - return ptr; - }(max_stream_size), - cudaFreeHost}; + // Allocate intermediate output stream buffer + size_t compressed_bfr_size = 0; + size_t num_compressed_blocks = 0; + + auto const max_compressed_block_size = + max_compression_output_size(compression_kind, compression_blocksize); + auto const padded_max_compressed_block_size = + util::round_up_unsafe(max_compressed_block_size, compressed_block_align); + auto const padded_block_header_size = + util::round_up_unsafe(block_header_size, compressed_block_align); + + auto stream_output = [&]() { + size_t max_stream_size = 0; + bool all_device_write = true; + + for (auto& ss : 
strm_descs.host_view().flat_view()) { + if (!out_sink.is_device_write_preferred(ss.stream_size)) { all_device_write = false; } + size_t stream_size = ss.stream_size; + if (compression_kind != NONE) { + ss.first_block = num_compressed_blocks; + ss.bfr_offset = compressed_bfr_size; + + auto num_blocks = + std::max((stream_size + compression_blocksize - 1) / compression_blocksize, 1); + stream_size += num_blocks * block_header_size; + num_compressed_blocks += num_blocks; + compressed_bfr_size += + (padded_block_header_size + padded_max_compressed_block_size) * num_blocks; } - }(); + max_stream_size = std::max(max_stream_size, stream_size); + } - // Compress the data streams - rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector comp_results(num_compressed_blocks, stream); - thrust::fill(rmm::exec_policy(stream), - comp_results.d_begin(), - comp_results.d_end(), - compression_result{0, compression_status::FAILURE}); - if (compression_kind_ != NONE) { - strm_descs.host_to_device(stream); - gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), - num_compressed_blocks, - compression_kind_, - compression_blocksize_, - max_compressed_block_size, - compressed_block_align, - strm_descs, - enc_data.streams, - comp_results, - stream); - - // deallocate encoded data as it is not needed anymore - enc_data.data = rmm::device_uvector{0, stream}; - - strm_descs.device_to_host(stream); - comp_results.device_to_host(stream, true); + if (all_device_write) { + return pinned_buffer{nullptr, cudaFreeHost}; + } else { + return pinned_buffer{[](size_t size) { + uint8_t* ptr = nullptr; + CUDF_CUDA_TRY(cudaMallocHost(&ptr, size)); + return ptr; + }(max_stream_size), + cudaFreeHost}; } + }(); + + // Compress the data streams + rmm::device_buffer compressed_data(compressed_bfr_size, stream); + hostdevice_vector comp_results(num_compressed_blocks, stream); + thrust::fill(rmm::exec_policy(stream), + comp_results.d_begin(), + comp_results.d_end(), + 
compression_result{0, compression_status::FAILURE}); + if (compression_kind != NONE) { + strm_descs.host_to_device(stream); + gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), + num_compressed_blocks, + compression_kind, + compression_blocksize, + max_compressed_block_size, + compressed_block_align, + strm_descs, + enc_data.streams, + comp_results, + stream); + + // deallocate encoded data as it is not needed anymore + enc_data.data = rmm::device_uvector{0, stream}; + + strm_descs.device_to_host(stream); + comp_results.device_to_host(stream, true); + } - auto intermediate_stats = gather_statistic_blobs(stats_freq_, orc_table, segmentation); + auto intermediate_stats = gather_statistic_blobs(stats_freq, orc_table, segmentation, stream); + + return {std::move(streams), + std::move(comp_results), + std::move(strm_descs), + std::move(enc_data), + std::move(segmentation), + std::move(stripes), + std::move(orc_table), + std::move(compressed_data), + std::move(intermediate_stats), + std::move(stream_output)}; +} - if (intermediate_stats.stripe_stat_chunks.size() > 0) { - persisted_stripe_statistics.persist( - orc_table.num_rows(), single_write_mode, intermediate_stats, stream); +} // namespace + +writer::impl::impl(std::unique_ptr sink, + orc_writer_options const& options, + SingleWriteMode mode, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _mr(mr), + stream(stream), + max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, + row_index_stride{options.get_row_index_stride()}, + compression_kind_(to_orc_compression(options.get_compression())), + compression_blocksize_(compression_block_size(compression_kind_)), + stats_freq_(options.get_statistics_freq()), + single_write_mode(mode == SingleWriteMode::YES), + kv_meta(options.get_key_value_metadata()), + out_sink_(std::move(sink)) +{ + if (options.get_metadata()) { + table_meta = std::make_unique(*options.get_metadata()); + } + init_state(); +} + 
+writer::impl::impl(std::unique_ptr sink, + chunked_orc_writer_options const& options, + SingleWriteMode mode, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _mr(mr), + stream(stream), + max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, + row_index_stride{options.get_row_index_stride()}, + compression_kind_(to_orc_compression(options.get_compression())), + compression_blocksize_(compression_block_size(compression_kind_)), + stats_freq_(options.get_statistics_freq()), + single_write_mode(mode == SingleWriteMode::YES), + kv_meta(options.get_key_value_metadata()), + out_sink_(std::move(sink)) +{ + if (options.get_metadata()) { + table_meta = std::make_unique(*options.get_metadata()); + } + init_state(); +} + +writer::impl::~impl() { close(); } + +void writer::impl::init_state() +{ + // Write file header + out_sink_->host_write(MAGIC, std::strlen(MAGIC)); +} + +void writer::impl::write(table_view const& input) +{ + CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); + + if (not table_meta) { table_meta = make_table_meta(input); } + + // All kinds of memory allocation and data compressions/encoding are performed here. + // If any error occurs, such as out-of-memory exception, the internal state of the current writer + // is still intact. + // Note that `out_sink_` is intentionally passed by const reference to prevent accidentally + // writing anything to it. + auto [streams, + comp_results, + strm_descs, + enc_data, + segmentation, + stripes, + orc_table, + compressed_data, + intermediate_stats, + stream_output] = [&] { + try { + return convert_table_to_orc_data(input, + *table_meta, + max_stripe_size, + row_index_stride, + enable_dictionary_, + compression_kind_, + compression_blocksize_, + stats_freq_, + single_write_mode, + *out_sink_, + stream); + } catch (...) { // catch any exception type + CUDF_LOG_ERROR( + "ORC writer encountered exception during processing. 
" + "No data has been written to the sink."); + throw; // this throws the same exception } + }(); - // Write stripes - std::vector> write_tasks; - for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { - auto& stripe = stripes[stripe_id]; - - stripe.offset = out_sink_->bytes_written(); - - // Column (skippable) index streams appear at the start of the stripe - for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) { - write_index_stream(stripe_id, - stream_id, - orc_table.columns, - segmentation, - enc_data.streams, - strm_descs, - comp_results, - intermediate_stats.rowgroup_blobs, - &stripe, - &streams); - } + // Compression/encoding were all successful. Now write the intermediate results. + write_orc_data_to_sink(streams, + comp_results, + strm_descs, + enc_data, + segmentation, + stripes, + orc_table, + compressed_data, + intermediate_stats, + stream_output.get()); + + // Update data into the footer. This needs to be called even when num_rows==0. + add_table_to_footer_data(orc_table, stripes); +} - // Column data consisting one or more separate streams - for (auto const& strm_desc : strm_descs[stripe_id]) { - write_tasks.push_back(write_data_stream( - strm_desc, - enc_data.streams[strm_desc.column_id][segmentation.stripes[stripe_id].first], - static_cast(compressed_data.data()), - stream_output.get(), - &stripe, - &streams)); - } +void writer::impl::write_orc_data_to_sink(orc_streams& streams, + hostdevice_vector const& comp_results, + hostdevice_2dvector const& strm_descs, + encoded_data const& enc_data, + file_segmentation const& segmentation, + std::vector& stripes, + orc_table_view const& orc_table, + rmm::device_buffer const& compressed_data, + intermediate_statistics& intermediate_stats, + uint8_t* stream_output) +{ + if (orc_table.num_rows() == 0) { return; } - // Write stripefooter consisting of stream information - StripeFooter sf; - sf.streams = streams; - sf.columns.resize(orc_table.num_columns() + 1); - 
sf.columns[0].kind = DIRECT; - for (size_t i = 1; i < sf.columns.size(); ++i) { - sf.columns[i].kind = orc_table.column(i - 1).orc_encoding(); - sf.columns[i].dictionarySize = - (sf.columns[i].kind == DICTIONARY_V2) - ? orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings - : 0; - if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } - } - ProtobufWriter pbw((compression_kind_ != NONE) ? 3 : 0); - pbw.write(sf); - stripe.footerLength = pbw.size(); - if (compression_kind_ != NONE) { - uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; - pbw.buffer()[0] = static_cast(uncomp_sf_len >> 0); - pbw.buffer()[1] = static_cast(uncomp_sf_len >> 8); - pbw.buffer()[2] = static_cast(uncomp_sf_len >> 16); - } - out_sink_->host_write(pbw.data(), pbw.size()); + if (intermediate_stats.stripe_stat_chunks.size() > 0) { + persisted_stripe_statistics.persist( + orc_table.num_rows(), single_write_mode, intermediate_stats, stream); + } + + // Write stripes + std::vector> write_tasks; + for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { + auto& stripe = stripes[stripe_id]; + + stripe.offset = out_sink_->bytes_written(); + + // Column (skippable) index streams appear at the start of the stripe + size_type const num_index_streams = (orc_table.num_columns() + 1); + for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) { + write_index_stream(stripe_id, + stream_id, + orc_table.columns, + segmentation, + enc_data.streams, + strm_descs, + comp_results, + intermediate_stats.rowgroup_blobs, + &stripe, + &streams, + compression_kind_, + compression_blocksize_, + out_sink_); } - for (auto const& task : write_tasks) { - task.wait(); + + // Column data consisting one or more separate streams + for (auto const& strm_desc : strm_descs[stripe_id]) { + write_tasks.push_back(write_data_stream( + strm_desc, + enc_data.streams[strm_desc.column_id][segmentation.stripes[stripe_id].first], + 
static_cast(compressed_data.data()), + stream_output, + &stripe, + &streams, + compression_kind_, + out_sink_, + stream)); } + + // Write stripefooter consisting of stream information + StripeFooter sf; + sf.streams = streams; + sf.columns.resize(orc_table.num_columns() + 1); + sf.columns[0].kind = DIRECT; + for (size_t i = 1; i < sf.columns.size(); ++i) { + sf.columns[i].kind = orc_table.column(i - 1).orc_encoding(); + sf.columns[i].dictionarySize = + (sf.columns[i].kind == DICTIONARY_V2) + ? orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings + : 0; + if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } + } + ProtobufWriter pbw((compression_kind_ != NONE) ? 3 : 0); + pbw.write(sf); + stripe.footerLength = pbw.size(); + if (compression_kind_ != NONE) { + uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; + pbw.buffer()[0] = static_cast(uncomp_sf_len >> 0); + pbw.buffer()[1] = static_cast(uncomp_sf_len >> 8); + pbw.buffer()[2] = static_cast(uncomp_sf_len >> 16); + } + out_sink_->host_write(pbw.data(), pbw.size()); + } + for (auto const& task : write_tasks) { + task.wait(); } +} + +void writer::impl::add_table_to_footer_data(orc_table_view const& orc_table, + std::vector& stripes) +{ if (ff.headerLength == 0) { // First call ff.headerLength = std::strlen(MAGIC); @@ -2372,7 +2634,7 @@ void writer::impl::write(table_view const& table) ff.stripes.insert(ff.stripes.end(), std::make_move_iterator(stripes.begin()), std::make_move_iterator(stripes.end())); - ff.numberOfRows += num_rows; + ff.numberOfRows += orc_table.num_rows(); } void writer::impl::close() @@ -2381,7 +2643,8 @@ void writer::impl::close() closed = true; PostScript ps; - auto const statistics = finish_statistic_blobs(ff.stripes.size(), persisted_stripe_statistics); + auto const statistics = + finish_statistic_blobs(ff.stripes.size(), persisted_stripe_statistics, stream); // File-level statistics if (not statistics.file_level.empty()) { @@ -2425,7 +2688,7 
@@ void writer::impl::close() if (md.stripeStats.size() != 0) { ProtobufWriter pbw((compression_kind_ != NONE) ? 3 : 0); pbw.write(md); - add_uncompressed_block_headers(pbw.buffer()); + add_uncompressed_block_headers(compression_kind_, compression_blocksize_, pbw.buffer()); ps.metadataLength = pbw.size(); out_sink_->host_write(pbw.data(), pbw.size()); } else { @@ -2433,7 +2696,7 @@ void writer::impl::close() } ProtobufWriter pbw((compression_kind_ != NONE) ? 3 : 0); pbw.write(ff); - add_uncompressed_block_headers(pbw.buffer()); + add_uncompressed_block_headers(compression_kind_, compression_blocksize_, pbw.buffer()); // Write postscript metadata ps.footerLength = pbw.size(); diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 691fba6bac2..27d74e45b46 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -176,6 +176,72 @@ struct stripe_size_limits { size_type rows; }; +/** + * @brief Statistics data stored between calls to write for chunked writes + * + */ +struct intermediate_statistics { + explicit intermediate_statistics(rmm::cuda_stream_view stream) : stripe_stat_chunks(0, stream) {} + + intermediate_statistics(std::vector rb, + rmm::device_uvector sc, + hostdevice_vector smg, + std::vector sdt, + std::vector sct) + : rowgroup_blobs(std::move(rb)), + stripe_stat_chunks(std::move(sc)), + stripe_stat_merge(std::move(smg)), + stats_dtypes(std::move(sdt)), + col_types(std::move(sct)) + { + } + + // blobs for the rowgroups. Not persisted + std::vector rowgroup_blobs; + + rmm::device_uvector stripe_stat_chunks; + hostdevice_vector stripe_stat_merge; + std::vector stats_dtypes; + std::vector col_types; +}; + +/** + * @brief used for chunked writes to persist data between calls to write. 
+ * + */ +struct persisted_statistics { + void clear() + { + stripe_stat_chunks.clear(); + stripe_stat_merge.clear(); + string_pools.clear(); + stats_dtypes.clear(); + col_types.clear(); + num_rows = 0; + } + + void persist(int num_table_rows, + bool single_write_mode, + intermediate_statistics& intermediate_stats, + rmm::cuda_stream_view stream); + + std::vector> stripe_stat_chunks; + std::vector> stripe_stat_merge; + std::vector> string_pools; + std::vector stats_dtypes; + std::vector col_types; + int num_rows = 0; +}; + +/** + * @brief Protobuf encoded statistics created at file close + * + */ +struct encoded_footer_statistics { + std::vector stripe_level; + std::vector file_level; +}; + /** * @brief Implementation for ORC writer */ @@ -227,7 +293,7 @@ class writer::impl { /** * @brief Writes a single subtable as part of a larger ORC file/table write. * - * @param[in] table The table information to be written + * @param table The table information to be written */ void write(table_view const& table); @@ -238,186 +304,41 @@ class writer::impl { private: /** - * @brief Builds up per-stripe dictionaries for string columns. - * - * @param orc_table Non-owning view of a cuDF table w/ ORC-related info - * @param stripe_bounds List of stripe boundaries - * @param dict List of dictionary chunks [rowgroup][column] - * @param dict_index List of dictionary indices - * @param dictionary_enabled Whether dictionary encoding is enabled for a given column - * @param stripe_dict List of stripe dictionaries - */ - void build_dictionaries(orc_table_view& orc_table, - host_span stripe_bounds, - hostdevice_2dvector const& dict, - host_span> dict_index, - host_span dictionary_enabled, - hostdevice_2dvector& stripe_dict); - - /** - * @brief Builds up per-column streams. 
- * - * @param[in,out] columns List of columns - * @param[in] segmentation stripe and rowgroup ranges - * @param[in] decimal_column_sizes Sizes of encoded decimal columns - * @return List of stream descriptors - */ - orc_streams create_streams(host_span columns, - file_segmentation const& segmentation, - std::map const& decimal_column_sizes); - - /** - * @brief Returns stripe information after compacting columns' individual data - * chunks into contiguous data streams. - * - * @param[in] num_index_streams Total number of index streams - * @param[in] segmentation stripe and rowgroup ranges - * @param[in,out] enc_streams List of encoder chunk streams [column][rowgroup] - * @param[in,out] strm_desc List of stream descriptors [stripe][data_stream] + * @brief Write the intermediate ORC data into the data sink. * - * @return The stripes' information - */ - std::vector gather_stripes( - size_t num_index_streams, - file_segmentation const& segmentation, - hostdevice_2dvector* enc_streams, - hostdevice_2dvector* strm_desc); - - /** - * @brief Statistics data stored between calls to write for chunked writes - * - */ - struct intermediate_statistics { - explicit intermediate_statistics(rmm::cuda_stream_view stream) - : stripe_stat_chunks(0, stream){}; - intermediate_statistics(std::vector rb, - rmm::device_uvector sc, - hostdevice_vector smg, - std::vector sdt, - std::vector sct) - : rowgroup_blobs(std::move(rb)), - stripe_stat_chunks(std::move(sc)), - stripe_stat_merge(std::move(smg)), - stats_dtypes(std::move(sdt)), - col_types(std::move(sct)){}; - - // blobs for the rowgroups. Not persisted - std::vector rowgroup_blobs; - - rmm::device_uvector stripe_stat_chunks; - hostdevice_vector stripe_stat_merge; - std::vector stats_dtypes; - std::vector col_types; - }; - - /** - * @brief used for chunked writes to persist data between calls to write. 
- * - */ - struct persisted_statistics { - void clear() - { - stripe_stat_chunks.clear(); - stripe_stat_merge.clear(); - string_pools.clear(); - stats_dtypes.clear(); - col_types.clear(); - num_rows = 0; - } - - void persist(int num_table_rows, - bool single_write_mode, - intermediate_statistics& intermediate_stats, - rmm::cuda_stream_view stream); - - std::vector> stripe_stat_chunks; - std::vector> stripe_stat_merge; - std::vector> string_pools; - std::vector stats_dtypes; - std::vector col_types; - int num_rows = 0; - }; - - /** - * @brief Protobuf encoded statistics created at file close - * - */ - struct encoded_footer_statistics { - std::vector stripe_level; - std::vector file_level; - }; - - /** - * @brief Returns column statistics in an intermediate format. - * - * @param statistics_freq Frequency of statistics to be included in the output file - * @param orc_table Table information to be written - * @param segmentation stripe and rowgroup ranges - * @return The statistic information - */ - intermediate_statistics gather_statistic_blobs(statistics_freq const statistics_freq, - orc_table_view const& orc_table, - file_segmentation const& segmentation); - - /** - * @brief Returns column statistics encoded in ORC protobuf format stored in the footer. - * - * @param num_stripes number of stripes in the data - * @param incoming_stats intermediate statistics returned from `gather_statistic_blobs` - * @return The encoded statistic blobs - */ - encoded_footer_statistics finish_statistic_blobs( - int num_stripes, writer::impl::persisted_statistics& incoming_stats); - - /** - * @brief Writes the specified column's row index stream. 
- * - * @param[in] stripe_id Stripe's identifier - * @param[in] stream_id Stream identifier (column id + 1) - * @param[in] columns List of columns - * @param[in] segmentation stripe and rowgroup ranges - * @param[in] enc_streams List of encoder chunk streams [column][rowgroup] - * @param[in] strm_desc List of stream descriptors - * @param[in] comp_out Output status for compressed streams - * @param[in] rg_stats row group level statistics - * @param[in,out] stripe Stream's parent stripe - * @param[in,out] streams List of all streams - */ - void write_index_stream(int32_t stripe_id, - int32_t stream_id, - host_span columns, - file_segmentation const& segmentation, - host_2dspan enc_streams, - host_2dspan strm_desc, - host_span comp_out, - std::vector const& rg_stats, - StripeInformation* stripe, - orc_streams* streams); - - /** - * @brief Write the specified column's data streams + * The intermediate data is generated from processing (compressing/encoding) an cuDF input table + * by `process_for_write` called in the `write()` function. 
* - * @param[in] strm_desc Stream's descriptor - * @param[in] enc_stream Chunk's streams - * @param[in] compressed_data Compressed stream data - * @param[in,out] stream_out Temporary host output buffer - * @param[in,out] stripe Stream's parent stripe - * @param[in,out] streams List of all streams - * @return An std::future that should be synchronized to ensure the writing is complete + * @param streams List of stream descriptors + * @param comp_results Status of data compression + * @param strm_descs List of stream descriptors + * @param enc_data ORC per-chunk streams of encoded data + * @param segmentation Description of how the ORC file is segmented into stripes and rowgroups + * @param stripes List of stripe description + * @param orc_table Non-owning view of a cuDF table that includes ORC-related information + * @param compressed_data Compressed stream data + * @param intermediate_stats Statistics data stored between calls to write + * @param stream_output Temporary host output buffer */ - std::future write_data_stream(gpu::StripeStream const& strm_desc, - gpu::encoder_chunk_streams const& enc_stream, - uint8_t const* compressed_data, - uint8_t* stream_out, - StripeInformation* stripe, - orc_streams* streams); + void write_orc_data_to_sink(orc_streams& streams, + hostdevice_vector const& comp_results, + hostdevice_2dvector const& strm_descs, + encoded_data const& enc_data, + file_segmentation const& segmentation, + std::vector& stripes, + orc_table_view const& orc_table, + rmm::device_buffer const& compressed_data, + intermediate_statistics& intermediate_stats, + uint8_t* stream_output); /** - * @brief Insert 3-byte uncompressed block headers in a byte vector + * @brief Add the processed table data into the internal file footer. 
* - * @param byte_vector Raw data (must include initial 3-byte header) + * @param orc_table Non-owning view of a cuDF table that includes ORC-related information + * @param stripes List of stripe description */ - void add_uncompressed_block_headers(std::vector& byte_vector); + void add_table_to_footer_data(orc_table_view const& orc_table, + std::vector& stripes); private: rmm::mr::device_memory_resource* _mr = nullptr; From 832dd27ad2665d3b70285f71168ab2daae942232 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 21 Mar 2023 16:22:43 -0400 Subject: [PATCH 34/63] Stop setting package version attribute in wheels (#12977) This PR removes modification of the `__init__.py::version` attribute that occurs during the wheel build process. See https://github.com/rapidsai/ops/issues/2592 for more information. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Sevag H (https://github.com/sevagh) URL: https://github.com/rapidsai/cudf/pull/12977 --- ci/release/apply_wheel_modifications.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh index 9d9758f1f15..0c55c4b9141 100755 --- a/ci/release/apply_wheel_modifications.sh +++ b/ci/release/apply_wheel_modifications.sh @@ -6,12 +6,6 @@ VERSION=${1} CUDA_SUFFIX=${2} -# __init__.py versions -sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/cudf/cudf/__init__.py -sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/dask_cudf/dask_cudf/__init__.py -sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/cudf_kafka/cudf_kafka/__init__.py -sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/custreamz/custreamz/__init__.py - # pyproject.toml versions sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/cudf/pyproject.toml sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/dask_cudf/pyproject.toml From 
f567cf5b100ba71934d08ac3c1c9648ce09594e1 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 21 Mar 2023 21:40:34 +0000 Subject: [PATCH 35/63] Correctly handle scalar indices in `Index.__getitem__` (#12955) It is not sufficient to check for isinstance(i, int) since the index may be a numpy type for which this check is False. Instead, invert the condition and check if the return value from _get_elements_from_column is a Column, in which case we should get an Index back. Closes #12954. Authors: - Lawrence Mitchell (https://github.com/wence-) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/12955 --- python/cudf/cudf/core/cut.py | 10 +++------- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/tests/test_index.py | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 6590cf2940d..ccf730c91fb 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. from collections import abc @@ -279,12 +279,8 @@ def cut( if labels is not None: if labels is not ordered and len(set(labels)) != len(labels): # when we have duplicate labels and ordered is False, we - # should allow duplicate categories. The categories are - # returned in order - new_data = [interval_labels[i][0] for i in index_labels.values] - return cudf.CategoricalIndex( - new_data, categories=sorted(set(labels)), ordered=False - ) + # should allow duplicate categories. 
+ return interval_labels[index_labels] col = build_categorical_column( categories=interval_labels, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 413e005b798..d1408fec160 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1403,7 +1403,7 @@ def __repr__(self): @_cudf_nvtx_annotate def __getitem__(self, index): res = self._get_elements_from_column(index) - if not isinstance(index, int): + if isinstance(res, ColumnBase): res = as_index(res) res.name = self.name return res diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index d043b917251..0b0c5fba7fa 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2886,3 +2886,22 @@ def test_index_to_pandas_nullable(data, expected_dtype): expected = pd.Index(data, dtype=expected_dtype) assert_eq(pi, expected) + + +class TestIndexScalarGetItem: + @pytest.fixture( + params=[range(1, 10, 2), [1, 2, 3], ["a", "b", "c"], [1.5, 2.5, 3.5]] + ) + def index_values(self, request): + return request.param + + @pytest.fixture(params=[int, np.int8, np.int32, np.int64]) + def i(self, request): + return request.param(1) + + def test_scalar_getitem(self, index_values, i): + index = cudf.Index(index_values) + + assert not isinstance(index[i], cudf.Index) + assert index[i] == index_values[i] + assert_eq(index, index.to_pandas()) From b2a65e89ba68f1954bc36ada61267cfec40e806a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 21 Mar 2023 22:03:13 -0400 Subject: [PATCH 36/63] Remove default detail mrs: part3 (#12966) This is the third PR in a sequence removing default mr parameters in detail APIs. Contributes to https://github.com/rapidsai/cudf/issues/12944. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/12966 --- cpp/benchmarks/common/generate_input.cu | 35 ++++++++--- cpp/benchmarks/join/join_common.hpp | 7 ++- cpp/include/cudf/detail/round.hpp | 13 ++-- cpp/include/cudf/detail/scatter.cuh | 15 +++-- cpp/include/cudf/detail/scatter.hpp | 48 +++++++------- cpp/include/cudf/detail/search.hpp | 15 +++-- cpp/include/cudf/detail/sequence.hpp | 33 +++++----- cpp/include/cudf/detail/transform.hpp | 63 ++++++++----------- cpp/include/cudf/detail/transpose.hpp | 9 ++- cpp/include/cudf/detail/unary.hpp | 52 +++++++-------- cpp/include/cudf/detail/valid_if.cuh | 13 ++-- .../cudf/io/text/detail/tile_state.hpp | 4 +- cpp/include/cudf/io/text/detail/trie.hpp | 6 +- cpp/include/cudf/lists/detail/contains.hpp | 42 ++++++------- cpp/src/groupby/sort/scan.cpp | 6 +- cpp/src/interop/to_arrow.cu | 2 +- cpp/src/io/text/multibyte_split.cu | 8 ++- cpp/src/join/semi_join.cu | 8 ++- cpp/src/lists/combine/concatenate_rows.cu | 8 ++- cpp/src/lists/extract.cu | 6 +- cpp/src/lists/set_operations.cu | 12 ++-- cpp/tests/bitmask/valid_if_tests.cu | 17 +++-- java/src/main/native/src/ColumnViewJni.cu | 7 ++- java/src/main/native/src/maps_column_view.cu | 8 ++- java/src/main/native/src/row_conversion.cu | 2 +- 25 files changed, 222 insertions(+), 217 deletions(-) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 545028260b8..762e9640d12 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -430,8 +430,12 @@ std::unique_ptr create_random_column(data_profile const& profile, null_mask.begin()); } - auto [result_bitmask, null_count] = cudf::detail::valid_if( - null_mask.begin(), null_mask.end(), thrust::identity{}, cudf::get_default_stream()); + auto [result_bitmask, null_count] = + cudf::detail::valid_if(null_mask.begin(), + 
null_mask.end(), + thrust::identity{}, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); return std::make_unique( dtype, @@ -509,8 +513,12 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons thrust::make_zip_iterator(offsets.begin(), offsets.begin() + 1), num_rows, string_generator{chars.data(), engine}); - auto [result_bitmask, null_count] = cudf::detail::valid_if( - null_mask.begin(), null_mask.end() - 1, thrust::identity{}, cudf::get_default_stream()); + auto [result_bitmask, null_count] = + cudf::detail::valid_if(null_mask.begin(), + null_mask.end() - 1, + thrust::identity{}, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); return cudf::make_strings_column( num_rows, std::move(offsets), @@ -628,8 +636,11 @@ std::unique_ptr create_random_column(data_profi auto [null_mask, null_count] = [&]() { if (profile.get_null_probability().has_value()) { auto valids = valid_dist(engine, num_rows); - return cudf::detail::valid_if( - valids.begin(), valids.end(), thrust::identity{}, cudf::get_default_stream()); + return cudf::detail::valid_if(valids.begin(), + valids.end(), + thrust::identity{}, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); } return std::pair{}; }(); @@ -712,9 +723,12 @@ std::unique_ptr create_random_column(data_profile auto offsets_column = std::make_unique( cudf::data_type{cudf::type_id::INT32}, num_rows + 1, offsets.release()); - auto [null_mask, null_count] = cudf::detail::valid_if( - valids.begin(), valids.end(), thrust::identity{}, cudf::get_default_stream()); - list_column = cudf::make_lists_column( + auto [null_mask, null_count] = cudf::detail::valid_if(valids.begin(), + valids.end(), + thrust::identity{}, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + list_column = cudf::make_lists_column( num_rows, std::move(offsets_column), std::move(current_child_column), @@ -840,7 +854,8 @@ std::pair create_random_null_mask( return 
cudf::detail::valid_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(size), bool_generator{seed, 1.0 - *null_probability}, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); } } diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index e37a4ca1193..70036a95377 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -104,8 +104,11 @@ void BM_join(state_type& state, Join JoinFunc) // roughly 75% nulls auto validity = thrust::make_transform_iterator(thrust::make_counting_iterator(0), null75_generator{}); - return cudf::detail::valid_if( - validity, validity + size, thrust::identity{}, cudf::get_default_stream()) + return cudf::detail::valid_if(validity, + validity + size, + thrust::identity{}, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()) .first; }; diff --git a/cpp/include/cudf/detail/round.hpp b/cpp/include/cudf/detail/round.hpp index 1e5612919f4..cdfc7caef37 100644 --- a/cpp/include/cudf/detail/round.hpp +++ b/cpp/include/cudf/detail/round.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,12 +31,11 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr round( - column_view const& input, - int32_t decimal_places, - rounding_method method, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr round(column_view const& input, + int32_t decimal_places, + rounding_method method, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index c8b17e22df2..dbf7bfa9527 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -386,13 +386,12 @@ struct column_scatterer_impl { * @return Result of scattering values from source to target */ template -std::unique_ptr
scatter( - table_view const& source, - MapIterator scatter_map_begin, - MapIterator scatter_map_end, - table_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr
scatter(table_view const& source, + MapIterator scatter_map_begin, + MapIterator scatter_map_end, + table_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index 7c4b04537ea..39ae4fe1944 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,12 +59,11 @@ namespace detail { * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ -std::unique_ptr
scatter( - table_view const& source, - column_view const& scatter_map, - table_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
scatter(table_view const& source, + column_view const& scatter_map, + table_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::detail::scatter(table_view const&,column_view const&,table_view @@ -72,12 +71,11 @@ std::unique_ptr
scatter( * * @throws cudf::logic_error if `scatter_map` span size is larger than max of `size_type`. */ -std::unique_ptr
scatter( - table_view const& source, - device_span const scatter_map, - table_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
scatter(table_view const& source, + device_span const scatter_map, + table_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Scatters a row of scalar values into a copy of the target table @@ -108,12 +106,11 @@ std::unique_ptr
scatter( * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ -std::unique_ptr
scatter( - std::vector> const& source, - column_view const& indices, - table_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
scatter(std::vector> const& source, + column_view const& indices, + table_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::boolean_mask_scatter( @@ -123,12 +120,11 @@ std::unique_ptr
scatter( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
boolean_mask_scatter( - table_view const& source, - table_view const& target, - column_view const& boolean_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
boolean_mask_scatter(table_view const& source, + table_view const& target, + column_view const& boolean_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::boolean_mask_scatter( @@ -144,7 +140,7 @@ std::unique_ptr
boolean_mask_scatter( table_view const& target, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp index 56d41fd635c..4c4ad7834f4 100644 --- a/cpp/include/cudf/detail/search.hpp +++ b/cpp/include/cudf/detail/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,12 +89,11 @@ std::unique_ptr contains(column_view const& haystack, * @param mr Device memory resource used to allocate the returned vector * @return A vector of bools indicating if each row in `needles` has matching rows in `haystack` */ -rmm::device_uvector contains( - table_view const& haystack, - table_view const& needles, - null_equality compare_nulls, - nan_equality compare_nans, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +rmm::device_uvector contains(table_view const& haystack, + table_view const& needles, + null_equality compare_nulls, + nan_equality compare_nans, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/sequence.hpp b/cpp/include/cudf/detail/sequence.hpp index 4a9bf5c74e1..3c3d1d0ed9e 100644 --- a/cpp/include/cudf/detail/sequence.hpp +++ b/cpp/include/cudf/detail/sequence.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,12 +32,11 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr sequence( - size_type size, - scalar const& init, - scalar const& step, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr sequence(size_type size, + scalar const& init, + scalar const& step, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::sequence(size_type size, scalar const& init, @@ -46,11 +45,10 @@ std::unique_ptr sequence( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr sequence( - size_type size, - scalar const& init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr sequence(size_type size, + scalar const& init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::calendrical_month_sequence(size_type size, @@ -60,12 +58,11 @@ std::unique_ptr sequence( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr calendrical_month_sequence( - size_type size, - scalar const& init, - size_type months, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr calendrical_month_sequence(size_type size, + scalar const& init, + size_type months, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 8e19ebb8da7..5b64f61f11a 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,24 +29,22 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr transform( - column_view const& input, - std::string const& unary_udf, - data_type output_type, - bool is_ptx, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr transform(column_view const& input, + std::string const& unary_udf, + data_type output_type, + bool is_ptx, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::compute_column * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr compute_column( - table_view const table, - ast::operation const& expr, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr compute_column(table_view const table, + ast::operation const& expr, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::nans_to_nulls @@ -54,9 +52,7 @@ std::unique_ptr compute_column( * @param stream CUDA stream used for device memory operations and kernel launches. */ std::pair, size_type> nans_to_nulls( - column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::bools_to_mask @@ -64,9 +60,7 @@ std::pair, size_type> nans_to_nulls( * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::pair, cudf::size_type> bools_to_mask( - column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::encode @@ -74,42 +68,37 @@ std::pair, cudf::size_type> bools_to_mask( * @param stream CUDA stream used for device memory operations and kernel launches. */ std::pair, std::unique_ptr> encode( - cudf::table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::one_hot_encode * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::pair, table_view> one_hot_encode( - column_view const& input, - column_view const& categories, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair, table_view> one_hot_encode(column_view const& input, + column_view const& categories, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::mask_to_bools * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr mask_to_bools( - bitmask_type const* null_mask, - size_type begin_bit, - size_type end_bit, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr mask_to_bools(bitmask_type const* null_mask, + size_type begin_bit, + size_type end_bit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::row_bit_count * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr row_bit_count( - table_view const& t, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr row_bit_count(table_view const& t, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/transpose.hpp b/cpp/include/cudf/detail/transpose.hpp index 0470d625edc..d0be51860b2 100644 --- a/cpp/include/cudf/detail/transpose.hpp +++ b/cpp/include/cudf/detail/transpose.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,10 +28,9 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::pair, table_view> transpose( - table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair, table_view> transpose(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index b7ecedc1489..3fbdf4a5a8f 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -45,13 +45,12 @@ namespace detail { */ template -std::unique_ptr true_if( - InputIterator begin, - InputIterator end, - size_type size, - Predicate p, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr true_if(InputIterator begin, + InputIterator end, + size_type size, + Predicate p, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto output = make_numeric_column(data_type(type_id::BOOL8), size, 
mask_state::UNALLOCATED, stream, mr); @@ -68,52 +67,47 @@ std::unique_ptr true_if( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr unary_operation( - cudf::column_view const& input, - cudf::unary_operator op, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr unary_operation(cudf::column_view const& input, + cudf::unary_operator op, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::is_valid * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr is_valid( - cudf::column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr is_valid(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::cast * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr cast( - column_view const& input, - data_type type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr cast(column_view const& input, + data_type type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::is_nan * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr is_nan( - cudf::column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr is_nan(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::is_not_nan * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr is_not_nan( - cudf::column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr is_not_nan(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index 04c78bed17d..76d6fd719a4 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,12 +86,11 @@ __global__ void valid_if_kernel( * null count */ template -std::pair valid_if( - InputIterator begin, - InputIterator end, - Predicate p, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::pair valid_if(InputIterator begin, + InputIterator end, + Predicate p, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(begin <= end, "Invalid range."); diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp index bf833d4720c..6ae399fbe75 100644 --- a/cpp/include/cudf/io/text/detail/tile_state.hpp +++ b/cpp/include/cudf/io/text/detail/tile_state.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -82,7 +82,7 @@ struct scan_tile_state { scan_tile_state(cudf::size_type num_tiles, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr) : tile_status(rmm::device_uvector>( num_tiles, stream, mr)), tile_state_partial(rmm::device_uvector(num_tiles, stream, mr)), diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index a908a9fa227..7bb2e4e2ece 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -165,7 +165,7 @@ struct trie { */ static trie create(std::string const& pattern, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr) { return create(std::vector{pattern}, stream, mr); @@ -181,7 +181,7 @@ struct trie { */ static trie create(std::vector const& patterns, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr) { std::vector tokens; std::vector transitions; diff --git a/cpp/include/cudf/lists/detail/contains.hpp b/cpp/include/cudf/lists/detail/contains.hpp index 24318e72e98..58ec18cb9ef 100644 --- a/cpp/include/cudf/lists/detail/contains.hpp +++ b/cpp/include/cudf/lists/detail/contains.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,12 +29,11 @@ namespace detail { * rmm::mr::device_memory_resource*) * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr index_of( - cudf::lists_column_view const& lists, - cudf::scalar const& search_key, - cudf::lists::duplicate_find_option find_option, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr index_of(cudf::lists_column_view const& lists, + cudf::scalar const& search_key, + cudf::lists::duplicate_find_option find_option, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::lists::index_of(cudf::lists_column_view const&, @@ -43,12 +42,11 @@ std::unique_ptr index_of( * rmm::mr::device_memory_resource*) * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr index_of( - cudf::lists_column_view const& lists, - cudf::column_view const& search_keys, - cudf::lists::duplicate_find_option find_option, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr index_of(cudf::lists_column_view const& lists, + cudf::column_view const& search_keys, + cudf::lists::duplicate_find_option find_option, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::lists::contains(cudf::lists_column_view const&, @@ -56,11 +54,10 @@ std::unique_ptr index_of( * rmm::mr::device_memory_resource*) * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr contains( - cudf::lists_column_view const& lists, - cudf::scalar const& search_key, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr contains(cudf::lists_column_view const& lists, + cudf::scalar const& search_key, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::lists::contains(cudf::lists_column_view const&, @@ -68,11 +65,10 @@ std::unique_ptr contains( * rmm::mr::device_memory_resource*) * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr contains( - cudf::lists_column_view const& lists, - cudf::column_view const& search_keys, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr contains(cudf::lists_column_view const& lists, + cudf::column_view const& search_keys, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 1aaa06750db..820dc8a3077 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -129,8 +129,10 @@ void scan_result_functor::operator()(aggregation const& agg) auto const group_labels_view = column_view(cudf::device_span(group_labels)); auto const gather_map = [&]() { if (is_presorted()) { // assumes both keys and values are sorted, Spark does this. - return cudf::detail::sequence( - group_labels.size(), *cudf::make_fixed_width_scalar(size_type{0}, stream), stream); + return cudf::detail::sequence(group_labels.size(), + *cudf::make_fixed_width_scalar(size_type{0}, stream), + stream, + rmm::mr::get_current_device_resource()); } else { auto sort_order = (rank_agg._method == rank_method::FIRST ? 
cudf::detail::stable_sorted_order : cudf::detail::sorted_order); diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 861b5b0fba4..7f88019beb2 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -215,7 +215,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - auto bitmask = bools_to_mask(input, stream); + auto bitmask = bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); auto data_buffer = allocate_arrow_buffer(static_cast(bitmask.first->size()), ar_mr); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index a0ba3e3ee35..afa260e215a 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -379,9 +379,11 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source // must be at least 32 when using warp-reduce on partials // must be at least 1 more than max possible concurrent tiles // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s - auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); - auto tile_multistates = scan_tile_state(num_tile_states, stream); - auto tile_offsets = scan_tile_state(num_tile_states, stream); + auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); + auto tile_multistates = + scan_tile_state(num_tile_states, stream, rmm::mr::get_current_device_resource()); + auto tile_offsets = + scan_tile_state(num_tile_states, stream, rmm::mr::get_current_device_resource()); multibyte_split_init_kernel<<> left_semi_anti_join( // Previously, the gather map was generated directly without this array but by calling to // `map.contains` inside the `thrust::copy_if` kernel. However, that led to increasing register // usage and reducing performance, as reported here: https://github.com/rapidsai/cudf/pull/10511. 
- auto const flagged = - cudf::detail::contains(right_keys, left_keys, compare_nulls, nan_equality::ALL_EQUAL, stream); + auto const flagged = cudf::detail::contains(right_keys, + left_keys, + compare_nulls, + nan_equality::ALL_EQUAL, + stream, + rmm::mr::get_current_device_resource()); auto const left_num_rows = left_keys.num_rows(); auto gather_map = diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index 8b006548391..b890a0c82a2 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -246,7 +246,8 @@ std::unique_ptr concatenate_rows(table_view const& input, auto const row_index = i % num_rows; return row_null_counts[row_index] != num_columns; }, - stream); + stream, + rmm::mr::get_current_device_resource()); } // NULLIFY_OUTPUT_ROW. 
Output row is nullfied if any input row is null return cudf::detail::valid_if( @@ -257,7 +258,8 @@ std::unique_ptr concatenate_rows(table_view const& input, auto const row_index = i % num_rows; return row_null_counts[row_index] == 0; }, - stream); + stream, + rmm::mr::get_current_device_resource()); }(); concat->set_null_mask(std::move(null_mask), null_count); } diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index bff63871e29..5d4a20d1cb8 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -101,8 +101,10 @@ std::unique_ptr make_index_child(size_type index, */ std::unique_ptr make_index_offsets(size_type num_lists, rmm::cuda_stream_view stream) { - return cudf::detail::sequence( - num_lists + 1, cudf::scalar_type_t(0, true, stream), stream); + return cudf::detail::sequence(num_lists + 1, + cudf::scalar_type_t(0, true, stream), + stream, + rmm::mr::get_current_device_resource()); } } // namespace diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index eb3ec5a8236..c05ef2fd644 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -83,8 +83,8 @@ std::unique_ptr have_overlap(lists_column_view const& lhs, auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}}; // Check existence for each row of the rhs_table in lhs_table. 
- auto const contained = - cudf::detail::contains(lhs_table, rhs_table, nulls_equal, nans_equal, stream); + auto const contained = cudf::detail::contains( + lhs_table, rhs_table, nulls_equal, nans_equal, stream, rmm::mr::get_current_device_resource()); auto const num_rows = lhs.size(); @@ -151,8 +151,8 @@ std::unique_ptr intersect_distinct(lists_column_view const& lhs, auto const lhs_table = table_view{{lhs_labels->view(), lhs_child}}; auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}}; - auto const contained = - cudf::detail::contains(lhs_table, rhs_table, nulls_equal, nans_equal, stream); + auto const contained = cudf::detail::contains( + lhs_table, rhs_table, nulls_equal, nans_equal, stream, rmm::mr::get_current_device_resource()); auto const intersect_table = cudf::detail::copy_if( rhs_table, @@ -231,8 +231,8 @@ std::unique_ptr difference_distinct(lists_column_view const& lhs, auto const lhs_table = table_view{{lhs_labels->view(), lhs_child}}; auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}}; - auto const contained = - cudf::detail::contains(rhs_table, lhs_table, nulls_equal, nans_equal, stream); + auto const contained = cudf::detail::contains( + rhs_table, lhs_table, nulls_equal, nans_equal, stream, rmm::mr::get_current_device_resource()); auto const difference_table = cudf::detail::copy_if( lhs_table, diff --git a/cpp/tests/bitmask/valid_if_tests.cu b/cpp/tests/bitmask/valid_if_tests.cu index cdc453be8e4..cb086cda179 100644 --- a/cpp/tests/bitmask/valid_if_tests.cu +++ b/cpp/tests/bitmask/valid_if_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -43,7 +43,8 @@ TEST_F(ValidIfTest, EmptyRange) auto actual = cudf::detail::valid_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(0), odds_valid{}, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto const& buffer = actual.first; EXPECT_EQ(0u, buffer.size()); EXPECT_EQ(nullptr, buffer.data()); @@ -55,7 +56,8 @@ TEST_F(ValidIfTest, InvalidRange) EXPECT_THROW(cudf::detail::valid_if(thrust::make_counting_iterator(1), thrust::make_counting_iterator(0), odds_valid{}, - cudf::get_default_stream()), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()), cudf::logic_error); } @@ -66,7 +68,8 @@ TEST_F(ValidIfTest, OddsValid) auto actual = cudf::detail::valid_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(10000), odds_valid{}, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size()); EXPECT_EQ(5000, actual.second); } @@ -78,7 +81,8 @@ TEST_F(ValidIfTest, AllValid) auto actual = cudf::detail::valid_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(10000), all_valid{}, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size()); EXPECT_EQ(0, actual.second); } @@ -90,7 +94,8 @@ TEST_F(ValidIfTest, AllNull) auto actual = cudf::detail::valid_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(10000), all_null{}, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size()); EXPECT_EQ(10000, actual.second); } diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu index 8a2c0b2b411..86c2add851a 100644 
--- a/java/src/main/native/src/ColumnViewJni.cu +++ b/java/src/main/native/src/ColumnViewJni.cu @@ -56,7 +56,7 @@ new_column_with_boolean_column_as_validity(cudf::column_view const &exemplar, auto [null_mask, null_count] = cudf::detail::valid_if( validity_begin, validity_end, [] __device__(auto optional_bool) { return optional_bool.value_or(false); }, - cudf::get_default_stream()); + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const exemplar_without_null_mask = cudf::column_view{ exemplar.type(), exemplar.size(), @@ -153,8 +153,9 @@ void post_process_list_overlap(cudf::column_view const &lhs, cudf::column_view c }); // Create a new nullmask from the validity data. - auto [new_null_mask, new_null_count] = cudf::detail::valid_if( - validity.begin(), validity.end(), thrust::identity{}, cudf::get_default_stream()); + auto [new_null_mask, new_null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); if (new_null_count > 0) { // If the `overlap_result` column is nullable, perform `bitmask_and` of its nullmask and the diff --git a/java/src/main/native/src/maps_column_view.cu b/java/src/main/native/src/maps_column_view.cu index 23254c0d501..1af7689f972 100644 --- a/java/src/main/native/src/maps_column_view.cu +++ b/java/src/main/native/src/maps_column_view.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -55,7 +55,8 @@ std::unique_ptr get_values_for_impl(maps_column_view const &maps_view, CUDF_EXPECTS(lookup_keys.type().id() == keys_.child().type().id(), "Lookup keys must have the same type as the keys of the map column."); auto key_indices = - lists::detail::index_of(keys_, lookup_keys, lists::duplicate_find_option::FIND_LAST, stream); + lists::detail::index_of(keys_, lookup_keys, lists::duplicate_find_option::FIND_LAST, stream, + rmm::mr::get_current_device_resource()); auto constexpr absent_offset = size_type{-1}; auto constexpr nullity_offset = std::numeric_limits::min(); thrust::replace(rmm::exec_policy(stream), key_indices->mutable_view().template begin(), @@ -86,7 +87,8 @@ std::unique_ptr contains_impl(maps_column_view const &maps_view, KeyT co auto const keys = maps_view.keys(); CUDF_EXPECTS(lookup_keys.type().id() == keys.child().type().id(), "Lookup keys must have the same type as the keys of the map column."); - auto const contains = lists::detail::contains(keys, lookup_keys, stream); + auto const contains = + lists::detail::contains(keys, lookup_keys, stream, rmm::mr::get_current_device_resource()); // Replace nulls with BOOL8{false}; auto const scalar_false = numeric_scalar{false, true, stream}; return detail::replace_nulls(contains->view(), scalar_false, stream, mr); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 747ff24f055..84f84f8b46f 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -1257,7 +1257,7 @@ static std::unique_ptr fixed_width_convert_to_rows( // Allocate and set the offsets row for the byte array std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream, mr); std::unique_ptr data = make_numeric_column(data_type(type_id::INT8), static_cast(total_allocation), From 913302aefc49db832da2d2d0b053016812805a4b 
Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Tue, 21 Mar 2023 23:08:46 -0400 Subject: [PATCH 37/63] Update `tests/column_utilities` to use `experimental::equality` row comparator (#12777) This PR is a part of #11844 Authors: - Divye Gala (https://github.com/divyegala) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/12777 --- cpp/include/cudf/utilities/type_checks.hpp | 13 ++- cpp/src/copying/purge_nonempty_nulls.cu | 4 +- cpp/src/table/row_operators.cu | 2 +- cpp/src/utilities/type_checks.cpp | 8 +- cpp/tests/copying/get_value_tests.cpp | 6 +- cpp/tests/interop/from_arrow_test.cpp | 4 +- cpp/tests/utilities/column_utilities.cu | 109 ++++++++++-------- .../utilities_tests/type_check_tests.cpp | 3 +- 8 files changed, 91 insertions(+), 58 deletions(-) diff --git a/cpp/include/cudf/utilities/type_checks.hpp b/cpp/include/cudf/utilities/type_checks.hpp index 4fa712fe7c3..b925fc8ae92 100644 --- a/cpp/include/cudf/utilities/type_checks.hpp +++ b/cpp/include/cudf/utilities/type_checks.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,4 +36,15 @@ namespace cudf { */ bool column_types_equal(column_view const& lhs, column_view const& rhs); +/** + * @brief Compare the type IDs of two `column_view`s + * This function returns true if the type of `lhs` equals that of `rhs`. + * - For fixed point types, the scale is ignored. 
+ * + * @param lhs The first `column_view` to compare + * @param rhs The second `column_view` to compare + * @return true if column types match + */ +bool column_types_equivalent(column_view const& lhs, column_view const& rhs); + } // namespace cudf diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu index 5bdf10c8af6..20a8ce986aa 100644 --- a/cpp/src/copying/purge_nonempty_nulls.cu +++ b/cpp/src/copying/purge_nonempty_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,6 +38,8 @@ bool has_nonempty_null_rows(cudf::column_view const& input, rmm::cuda_stream_vie { if (not input.has_nulls()) { return false; } // No nulls => no dirty rows. + if ((input.size() == input.null_count()) && (input.num_children() == 0)) { return false; } + // Cross-reference nullmask and offsets. auto const type = input.type().id(); auto const offsets = (type == type_id::STRING) ? 
(strings_column_view{input}).offsets() diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index ae49ad17e53..0c6747f2d12 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -334,7 +334,7 @@ void check_shape_compatibility(table_view const& lhs, table_view const& rhs) CUDF_EXPECTS(lhs.num_columns() == rhs.num_columns(), "Cannot compare tables with different number of columns"); for (size_type i = 0; i < lhs.num_columns(); ++i) { - CUDF_EXPECTS(column_types_equal(lhs.column(i), rhs.column(i)), + CUDF_EXPECTS(column_types_equivalent(lhs.column(i), rhs.column(i)), "Cannot compare tables with different column types"); } } diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp index d297148de45..d6f5c65593a 100644 --- a/cpp/src/utilities/type_checks.cpp +++ b/cpp/src/utilities/type_checks.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,4 +69,10 @@ bool column_types_equal(column_view const& lhs, column_view const& rhs) return type_dispatcher(lhs.type(), columns_equal_fn{}, lhs, rhs); } +bool column_types_equivalent(column_view const& lhs, column_view const& rhs) +{ + if (lhs.type().id() != rhs.type().id()) { return false; } + return type_dispatcher(lhs.type(), columns_equal_fn{}, lhs, rhs); +} + } // namespace cudf diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index 1c51eab1f94..a35bbab0176 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -812,7 +812,7 @@ TYPED_TEST(StructGetValueTestTyped, mixed_types_valid) // col fields cudf::test::fixed_width_column_wrapper f1{1, 2, 3}; cudf::test::strings_column_wrapper f2{"aa", "bbb", "c"}; - cudf::test::dictionary_column_wrapper f3{42, 42, 24}; + cudf::test::dictionary_column_wrapper f3{42, 42, 24}; LCW f4{LCW{8, 8, 8}, LCW{9, 9}, LCW{10}}; cudf::test::structs_column_wrapper col{f1, f2, f3, f4}; @@ -824,7 +824,7 @@ TYPED_TEST(StructGetValueTestTyped, mixed_types_valid) // expect fields cudf::test::fixed_width_column_wrapper ef1{3}; cudf::test::strings_column_wrapper ef2{"c"}; - cudf::test::dictionary_column_wrapper ef3{24}; + cudf::test::dictionary_column_wrapper ef3{24}; LCW ef4{LCW{10}}; cudf::table_view expect_data{{ef1, ef2, ef3, ef4}}; diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index d2b159fc208..3f4d5bcf20f 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -264,7 +264,7 @@ TEST_F(FromArrowTest, DictionaryIndicesType) auto arrow_table = arrow::Table::Make(schema, {array1, array2, array3}); std::vector> columns; - auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1}); + auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1}); columns.emplace_back(std::move(cudf::dictionary::encode(col))); columns.emplace_back(std::move(cudf::dictionary::encode(col))); columns.emplace_back(std::move(cudf::dictionary::encode(col))); diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 6c441539621..3a94aac1cc9 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -371,55 +372,56 @@ struct column_property_comparator { } }; +template class corresponding_rows_unequal { public: - corresponding_rows_unequal(table_device_view d_lhs, - table_device_view d_rhs, - column_device_view lhs_row_indices_, + corresponding_rows_unequal(column_device_view lhs_row_indices_, column_device_view rhs_row_indices_, - size_type /*fp_ulps*/) - : comp(cudf::nullate::YES{}, d_lhs, d_rhs, cudf::null_equality::EQUAL), - lhs_row_indices(lhs_row_indices_), - rhs_row_indices(rhs_row_indices_) + size_type /*fp_ulps*/, + DeviceComparator comp_, + column_device_view /*lhs*/, + column_device_view /*rhs*/) + : lhs_row_indices(lhs_row_indices_), rhs_row_indices(rhs_row_indices_), comp(comp_) { } - cudf::row_equality_comparator comp; - __device__ bool operator()(size_type index) { - return !comp(lhs_row_indices.element(index), - rhs_row_indices.element(index)); + using cudf::experimental::row::lhs_index_type; + using cudf::experimental::row::rhs_index_type; + + return !comp(lhs_index_type{lhs_row_indices.element(index)}, + 
rhs_index_type{rhs_row_indices.element(index)}); } column_device_view lhs_row_indices; column_device_view rhs_row_indices; + DeviceComparator comp; }; +template class corresponding_rows_not_equivalent { - table_device_view d_lhs; - table_device_view d_rhs; - column_device_view lhs_row_indices; column_device_view rhs_row_indices; - size_type const fp_ulps; + DeviceComparator comp; + column_device_view lhs; + column_device_view rhs; public: - corresponding_rows_not_equivalent(table_device_view d_lhs, - table_device_view d_rhs, - column_device_view lhs_row_indices_, + corresponding_rows_not_equivalent(column_device_view lhs_row_indices_, column_device_view rhs_row_indices_, - size_type fp_ulps_) - : d_lhs(d_lhs), - d_rhs(d_rhs), - comp(cudf::nullate::YES{}, d_lhs, d_rhs, null_equality::EQUAL), - lhs_row_indices(lhs_row_indices_), + size_type fp_ulps_, + DeviceComparator comp_, + column_device_view lhs_, + column_device_view rhs_) + : lhs_row_indices(lhs_row_indices_), rhs_row_indices(rhs_row_indices_), - fp_ulps(fp_ulps_) + fp_ulps(fp_ulps_), + comp(comp_), + lhs(lhs_), + rhs(rhs_) { - CUDF_EXPECTS(d_lhs.num_columns() == 1 and d_rhs.num_columns() == 1, - "Unsupported number of columns"); } struct typed_element_not_equivalent { @@ -459,23 +461,17 @@ class corresponding_rows_not_equivalent { } }; - cudf::row_equality_comparator comp; - __device__ bool operator()(size_type index) { + using cudf::experimental::row::lhs_index_type; + using cudf::experimental::row::rhs_index_type; + auto const lhs_index = lhs_row_indices.element(index); auto const rhs_index = rhs_row_indices.element(index); - if (not comp(lhs_index, rhs_index)) { - auto lhs_col = this->d_lhs.column(0); - auto rhs_col = this->d_rhs.column(0); - return type_dispatcher(lhs_col.type(), - typed_element_not_equivalent{}, - lhs_col, - rhs_col, - lhs_index, - rhs_index, - fp_ulps); + if (not comp(lhs_index_type{lhs_index}, rhs_index_type{rhs_index})) { + return type_dispatcher( + lhs.type(), 
typed_element_not_equivalent{}, lhs, rhs, lhs_index, rhs_index, fp_ulps); } return false; } @@ -536,25 +532,42 @@ struct column_comparator_impl { size_type fp_ulps, int depth) { - auto d_lhs = cudf::table_device_view::create(table_view{{lhs}}); - auto d_rhs = cudf::table_device_view::create(table_view{{rhs}}); - auto d_lhs_row_indices = cudf::column_device_view::create(lhs_row_indices); auto d_rhs_row_indices = cudf::column_device_view::create(rhs_row_indices); - using ComparatorType = std::conditional_t; + auto d_lhs = cudf::column_device_view::create(lhs); + auto d_rhs = cudf::column_device_view::create(rhs); + + auto lhs_tview = table_view{{lhs}}; + auto rhs_tview = table_view{{rhs}}; + + auto const comparator = cudf::experimental::row::equality::two_table_comparator{ + lhs_tview, rhs_tview, cudf::get_default_stream()}; + auto const has_nulls = cudf::has_nested_nulls(lhs_tview) or cudf::has_nested_nulls(rhs_tview); + + auto const device_comparator = comparator.equal_to(cudf::nullate::DYNAMIC{has_nulls}); + + using ComparatorType = + std::conditional_t, + corresponding_rows_not_equivalent>; auto differences = rmm::device_uvector( - lhs.size(), cudf::get_default_stream()); // worst case: everything different + lhs_row_indices.size(), cudf::get_default_stream()); // worst case: everything different auto input_iter = thrust::make_counting_iterator(0); - auto diff_iter = thrust::copy_if( + + thrust::transform( rmm::exec_policy(cudf::get_default_stream()), input_iter, input_iter + lhs_row_indices.size(), differences.begin(), - ComparatorType(*d_lhs, *d_rhs, *d_lhs_row_indices, *d_rhs_row_indices, fp_ulps)); + ComparatorType( + *d_lhs_row_indices, *d_rhs_row_indices, fp_ulps, device_comparator, *d_lhs, *d_rhs)); + + auto diff_iter = thrust::remove(rmm::exec_policy(cudf::get_default_stream()), + differences.begin(), + differences.end(), + 0); // remove the zero entries differences.resize(thrust::distance(differences.begin(), diff_iter), cudf::get_default_stream()); // 
shrink back down diff --git a/cpp/tests/utilities_tests/type_check_tests.cpp b/cpp/tests/utilities_tests/type_check_tests.cpp index 84a2d15d477..f65c3652dc9 100644 --- a/cpp/tests/utilities_tests/type_check_tests.cpp +++ b/cpp/tests/utilities_tests/type_check_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -147,6 +147,7 @@ TEST_F(ColumnTypeCheckTest, DifferentFixedWidth) fixed_point_column_wrapper rhs5({10000}, numeric::scale_type{0}); EXPECT_FALSE(column_types_equal(lhs5, rhs5)); + EXPECT_TRUE(column_types_equivalent(lhs5, rhs5)); // Different rep, same scale fixed_point_column_wrapper lhs6({10000}, numeric::scale_type{-1}); From 6e41db08cc33e75ff64f66572e50486b9bc2ccea Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 22 Mar 2023 01:15:26 -0400 Subject: [PATCH 38/63] Add developer documentation forbidding default parameters in detail APIs (#12978) Resolves #12944. Also reflows the text to ensure consistent text width. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) Approvers: - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) - Mark Harris (https://github.com/harrism) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12978 --- .../developer_guide/DEVELOPER_GUIDE.md | 148 +++++++++++------- 1 file changed, 95 insertions(+), 53 deletions(-) diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index a88f621095c..91c3dccfdc6 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -121,8 +121,8 @@ recommend watching Sean Parent's [C++ Seasoning talk](https://www.youtube.com/wa and we try to follow his rules: "No raw loops. No raw pointers. No raw synchronization primitives." * Prefer algorithms from STL and Thrust to raw loops. - * Prefer libcudf and RMM [owning data structures and views](#libcudf-data-structures) to raw pointers - and raw memory allocation. + * Prefer libcudf and RMM [owning data structures and views](#libcudf-data-structures) to raw + pointers and raw memory allocation. * libcudf doesn't have a lot of CPU-thread concurrency, but there is some. And currently libcudf does use raw synchronization primitives. So we should revisit Parent's third rule and improve here. @@ -146,8 +146,8 @@ The following guidelines apply to organizing `#include` lines. * Separate groups by a blank line. * Order the groups from "nearest" to "farthest". In other words, local includes, then includes from other RAPIDS libraries, then includes from related libraries, like ``, then - includes from dependencies installed with cuDF, and then standard headers (for example ``, - ``). + includes from dependencies installed with cuDF, and then standard headers (for example + ``, ``). 
* Use `<>` instead of `""` unless the header is in the same directory as the source file. * Tools like `clangd` often auto-insert includes when they can, but they usually get the grouping and brackets wrong. @@ -271,10 +271,12 @@ A *mutable*, non-owning view of a table. ## cudf::size_type -The `cudf::size_type` is the type used for the number of elements in a column, offsets to elements within a column, indices to address specific elements, segments for subsets of column elements, etc. +The `cudf::size_type` is the type used for the number of elements in a column, offsets to elements +within a column, indices to address specific elements, segments for subsets of column elements, etc. It is equivalent to a signed, 32-bit integer type and therefore has a maximum value of 2147483647. -Some APIs also accept negative index values and those functions support a minimum value of -2147483648. -This fundamental type also influences output values not just for column size limits but for counting elements as well. +Some APIs also accept negative index values and those functions support a minimum value of +-2147483648. This fundamental type also influences output values not just for column size limits +but for counting elements as well. ## Spans @@ -343,8 +345,8 @@ auto s1 = static_cast(s.get()); ``` ### Passing to device -Each scalar type, except `list_scalar`, has a corresponding non-owning device view class which allows -access to the value and its validity from the device. This can be obtained using the function +Each scalar type, except `list_scalar`, has a corresponding non-owning device view class which +allows access to the value and its validity from the device. This can be obtained using the function `get_scalar_device_view(ScalarType s)`. Note that a device view is not provided for a base scalar object, only for the derived typed scalar class objects. 
@@ -355,68 +357,84 @@ data, a specialized device view for list columns can be constructed via # libcudf Policies and Design Principles -`libcudf` is designed to provide thread-safe, single-GPU accelerated algorithm primitives for solving a wide variety of problems that arise in data science. -APIs are written to execute on the default GPU, which can be controlled by the caller through standard CUDA device APIs or environment variables like `CUDA_VISIBLE_DEVICES`. -Our goal is to enable diverse use cases like Spark or Pandas to benefit from the performance of GPUs, and libcudf relies on these higher-level layers like Spark or Dask to orchestrate multi-GPU tasks. +`libcudf` is designed to provide thread-safe, single-GPU accelerated algorithm primitives for +solving a wide variety of problems that arise in data science. APIs are written to execute on the +default GPU, which can be controlled by the caller through standard CUDA device APIs or environment +variables like `CUDA_VISIBLE_DEVICES`. Our goal is to enable diverse use cases like Spark or Pandas +to benefit from the performance of GPUs, and libcudf relies on these higher-level layers like Spark +or Dask to orchestrate multi-GPU tasks. -To best satisfy these use-cases, libcudf prioritizes performance and flexibility, which sometimes may come at the cost of convenience. -While we welcome users to use libcudf directly, we design with the expectation that most users will be consuming libcudf through higher-level layers like Spark or cuDF Python that handle some of details that direct users of libcudf must handle on their own. -We document these policies and the reasons behind them here. +To best satisfy these use-cases, libcudf prioritizes performance and flexibility, which sometimes +may come at the cost of convenience. 
While we welcome users to use libcudf directly, we design with +the expectation that most users will be consuming libcudf through higher-level layers like Spark or +cuDF Python that handle some of the details that direct users of libcudf must handle on their own. We +document these policies and the reasons behind them here. ## libcudf does not introspect data libcudf APIs generally do not perform deep introspection and validation of input data. There are numerous reasons for this: 1. It violates the single responsibility principle: validation is separate from execution. -2. Since libcudf data structures store data on the GPU, any validation incurs _at minimum_ the overhead of a kernel launch, and may in general be prohibitively expensive. +2. Since libcudf data structures store data on the GPU, any validation incurs _at minimum_ the + overhead of a kernel launch, and may in general be prohibitively expensive. 3. API promises around data introspection often significantly complicate implementation. Users are therefore responsible for passing valid data into such APIs. _Note that this policy does not mean that libcudf performs no validation whatsoever_. libcudf APIs should still perform any validation that does not require introspection. -To give some idea of what should or should not be validated, here are (non-exhaustive) lists of examples. +To give some idea of what should or should not be validated, here are (non-exhaustive) lists of +examples. 
**Things that libcudf should validate**: - Input column/table sizes or data types **Things that libcudf should not validate**: - Integer overflow -- Ensuring that outputs will not exceed the [2GB size](#cudfsize_type) limit for a given set of inputs +- Ensuring that outputs will not exceed the [2GB size](#cudfsize_type) limit for a given set of + inputs ## libcudf expects nested types to have sanitized null masks -Various libcudf APIs accepting columns of nested data types (such as `LIST` or `STRUCT`) may assume that these columns have been sanitized. -In this context, sanitization refers to ensuring that the null elements in a column with a nested dtype are compatible with the elements of nested columns. +Various libcudf APIs accepting columns of nested data types (such as `LIST` or `STRUCT`) may assume +that these columns have been sanitized. In this context, sanitization refers to ensuring that the +null elements in a column with a nested dtype are compatible with the elements of nested columns. Specifically: -- Null elements of list columns should also be empty. The starting offset of a null element should be equal to the ending offset. +- Null elements of list columns should also be empty. The starting offset of a null element should + be equal to the ending offset. - Null elements of struct columns should also be null elements in the underlying structs. -- For compound columns, nulls should only be present at the level of the parent column. Child columns should not contain nulls. +- For compound columns, nulls should only be present at the level of the parent column. Child + columns should not contain nulls. - Slice operations on nested columns do not propagate offsets to child columns. -libcudf APIs _should_ promise to never return "dirty" columns, i.e. columns containing unsanitized data. -Therefore, the only problem is if users construct input columns that are not correctly sanitized and then pass those into libcudf APIs. 
+libcudf APIs _should_ promise to never return "dirty" columns, i.e. columns containing unsanitized +data. Therefore, the only problem is if users construct input columns that are not correctly +sanitized and then pass those into libcudf APIs. ## Treat libcudf APIs as if they were asynchronous libcudf APIs called on the host do not guarantee that the stream is synchronized before returning. -Work in libcudf occurs on `cudf::get_default_stream().value`, which defaults to the CUDA default stream (stream 0). -Note that the stream 0 behavior differs if [per-thread default stream is enabled](https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html) via `CUDF_USE_PER_THREAD_DEFAULT_STREAM`. -Any data provided to or returned by libcudf that uses a separate non-blocking stream requires synchronization with the default libcudf stream to ensure stream safety. +Work in libcudf occurs on `cudf::get_default_stream().value`, which defaults to the CUDA default +stream (stream 0). Note that the stream 0 behavior differs if [per-thread default stream is +enabled](https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html) via +`CUDF_USE_PER_THREAD_DEFAULT_STREAM`. Any data provided to or returned by libcudf that uses a +separate non-blocking stream requires synchronization with the default libcudf stream to ensure +stream safety. ## libcudf generally does not make ordering guarantees -Functions like merge or groupby in libcudf make no guarantees about the order of entries in the output. -Promising deterministic ordering is not, in general, conducive to fast parallel algorithms. +Functions like merge or groupby in libcudf make no guarantees about the order of entries in the +output. Promising deterministic ordering is not, in general, conducive to fast parallel algorithms. Calling code is responsible for performing sorts after the fact if sorted outputs are needed. 
## libcudf does not promise specific exception messages -libcudf documents the exceptions that will be thrown by an API for different kinds of invalid inputs. -The types of those exceptions (e.g. `cudf::logic_error`) are part of the public API. -However, the explanatory string returned by the `what` method of those exceptions is not part of the API and is subject to change. -Calling code should not rely on the contents of libcudf error messages to determine the nature of the error. -For information on the types of exceptions that libcudf throws under different circumstances, see the [section on error handling](#errors). +libcudf documents the exceptions that will be thrown by an API for different kinds of invalid +inputs. The types of those exceptions (e.g. `cudf::logic_error`) are part of the public API. +However, the explanatory string returned by the `what` method of those exceptions is not part of the +API and is subject to change. Calling code should not rely on the contents of libcudf error +messages to determine the nature of the error. For information on the types of exceptions that +libcudf throws under different circumstances, see the [section on error handling](#errors). # libcudf API and Implementation @@ -475,14 +493,6 @@ asynchrony if and when we add an asynchronous API to libcudf. **Note:** `cudaDeviceSynchronize()` should *never* be used. This limits the ability to do any multi-stream/multi-threaded work with libcudf APIs. - ### NVTX Ranges - -In order to aid in performance optimization and debugging, all compute intensive libcudf functions -should have a corresponding NVTX range. In libcudf, we have a convenience macro `CUDF_FUNC_RANGE()` -that will automatically annotate the lifetime of the enclosing function and use the function's name -as the name of the NVTX range. For more information about NVTX, see -[here](https://github.com/NVIDIA/NVTX/tree/dev/c). 
- ### Stream Creation There may be times in implementing libcudf features where it would be advantageous to use streams @@ -494,8 +504,8 @@ should avoid creating streams (even if it is slightly less efficient). It is a g ## Memory Allocation -Device [memory resources](#rmmdevice_memory_resource) are used in libcudf to abstract and control how device -memory is allocated. +Device [memory resources](#rmmdevice_memory_resource) are used in libcudf to abstract and control +how device memory is allocated. ### Output Memory @@ -515,6 +525,12 @@ std::unique_ptr returns_output_memory( void does_not_allocate_output_memory(...); ``` +This rule automatically applies to all detail APIs that allocate memory. Any detail API may be +called by any public API, and therefore could be allocating memory that is returned to the user. +To support such use cases, all detail APIs allocating memory resources should accept an `mr` +parameter. Callers are responsible for either passing through a provided `mr` or +`rmm::mr::get_current_device_resource()` as needed. + ### Temporary Memory Not all memory allocated within a libcudf API is returned to the caller. Often algorithms must @@ -535,7 +551,7 @@ rmm::device_buffer some_function( ### Memory Management libcudf code generally eschews raw pointers and direct memory allocation. Use RMM classes built to -use `device_memory_resource`(*)s for device memory allocation with automated lifetime management. +use `device_memory_resource`s for device memory allocation with automated lifetime management. #### rmm::device_buffer Allocates a specified number of bytes of untyped, uninitialized device memory using a @@ -617,6 +633,32 @@ rmm::mr::device_memory_resource * mr = new my_custom_resource{...}; rmm::device_uvector v2{100, s, mr}; ``` +## Default Parameters + +While public libcudf APIs are free to include default function parameters, detail functions should +not. 
Default memory resource parameters make it easy for developers to accidentally allocate memory +using the incorrect resource. Avoiding default memory resources forces developers to consider each +memory allocation carefully. + +While streams are not currently exposed in libcudf's API, we plan to do so eventually. As a result, +the same reasons for memory resources also apply to streams. Public APIs default to using +`cudf::get_default_stream()`. However, including the same default in detail APIs opens the door for +developers to forget to pass in a user-provided stream if one is passed to a public API. Forcing +every detail API call to explicitly pass a stream is intended to prevent such mistakes. + +The memory resources (and eventually, the stream) are the final parameters for essentially all +public APIs. For API consistency, the same is true throughout libcudf's internals. Therefore, a +consequence of not allowing default streams or MRs is that no parameters in detail APIs may have +defaults. + +## NVTX Ranges + +In order to aid in performance optimization and debugging, all compute intensive libcudf functions +should have a corresponding NVTX range. libcudf has a convenience macro `CUDF_FUNC_RANGE()` that +automatically annotates the lifetime of the enclosing function and uses the function's name as +the name of the NVTX range. For more information about NVTX, see +[here](https://github.com/NVIDIA/NVTX/tree/dev/c). + ## Input/Output Style The preferred style for how inputs are passed in and outputs are returned is the following: @@ -886,9 +928,9 @@ CUDF_FAIL("This code path should not be reached."); ### CUDA Error Checking -Use the `CUDF_CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This -macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. The -thrown exception includes a description of the CUDA error code in its `what()` message. 
+Use the `CUDF_CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. +This macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. +The thrown exception includes a description of the CUDA error code in its `what()` message. Example: @@ -1111,8 +1153,8 @@ For list columns, the parent column's type is `LIST` and contains no data, but i the number of lists in the column, and its null mask represents the validity of each list element. The parent has two children. -1. A non-nullable column of [`size_type`](#cudfsize_type) elements that indicates the offset to the beginning of each list - in a dense column of elements. +1. A non-nullable column of [`size_type`](#cudfsize_type) elements that indicates the offset to the + beginning of each list in a dense column of elements. 2. A column containing the actual data and optional null mask for all elements of all the lists packed together. @@ -1271,9 +1313,9 @@ libcudf provides view types for nested column types as well as for the data elem `cudf::strings_column_view` is a view of a strings column, like `cudf::column_view` is a view of any `cudf::column`. `cudf::string_view` is a view of a single string, and therefore `cudf::string_view` is the data type of a `cudf::column` of type `STRING` just like `int32_t` is the -data type for a `cudf::column` of type [`size_type`](#cudfsize_type). As its name implies, this is a read-only object -instance that points to device memory inside the strings column. It's lifespan is the same (or less) -as the column it views. +data type for a `cudf::column` of type [`size_type`](#cudfsize_type). As its name implies, this is a +read-only object instance that points to device memory inside the strings column. It's lifespan is +the same (or less) as the column it views. Use the `column_device_view::element` method to access an individual row element. Like any other column, do not call `element()` on a row that is null. 
From bf18cea1461b63ffef32a5404b7552d1e90240e9 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 21 Mar 2023 22:16:14 -0700 Subject: [PATCH 39/63] Make timezone table independent from ORC (#12805) Modifies the creation of timezone transition tables to make it usable outside of the ORC context: - Remove the ORC epoch offset from the transition table. - Use `timestamp_s` and `duration_s` instead of integral values in the table. - Return table as `cudf::table` instead of the special `timezone_table` struct. - Move the implementation/header out of `cudf::io::orc`. - Split the header into C++ and CUDA parts. C++ API has public and `detail` counterparts. Other: Adapt ORC reader to changes to the timezone transition table. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Ashwin Srinath (https://github.com/shwina) - MithunR (https://github.com/mythrocks) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12805 --- conda/recipes/libcudf/meta.yaml | 3 + cpp/CMakeLists.txt | 2 +- cpp/include/cudf/detail/timezone.cuh | 79 ++++++++++++ cpp/include/cudf/detail/timezone.hpp | 36 ++++++ cpp/include/cudf/timezone.hpp | 54 ++++++++ cpp/src/{io/orc => datetime}/timezone.cpp | 108 ++++++++++------ cpp/src/io/orc/orc.hpp | 2 + cpp/src/io/orc/orc_gpu.hpp | 4 +- cpp/src/io/orc/reader_impl.cu | 17 +-- cpp/src/io/orc/reader_impl.hpp | 6 +- cpp/src/io/orc/stripe_data.cu | 46 +++---- cpp/src/io/orc/stripe_enc.cu | 7 +- cpp/src/io/orc/timezone.cuh | 144 ---------------------- 13 files changed, 283 insertions(+), 225 deletions(-) create mode 100644 cpp/include/cudf/detail/timezone.cuh create mode 100644 cpp/include/cudf/detail/timezone.hpp create mode 100644 cpp/include/cudf/timezone.hpp rename cpp/src/{io/orc => datetime}/timezone.cpp (80%) delete mode 100644 cpp/src/io/orc/timezone.cuh diff --git a/conda/recipes/libcudf/meta.yaml 
b/conda/recipes/libcudf/meta.yaml index caa807bd7ec..0b2fc71aacd 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -92,6 +92,7 @@ outputs: - test -f $PREFIX/include/cudf/concatenate.hpp - test -f $PREFIX/include/cudf/copying.hpp - test -f $PREFIX/include/cudf/datetime.hpp + - test -f $PREFIX/include/cudf/timezone.hpp - test -f $PREFIX/include/cudf/detail/aggregation/aggregation.hpp - test -f $PREFIX/include/cudf/detail/aggregation/result_cache.hpp - test -f $PREFIX/include/cudf/detail/binaryop.hpp @@ -128,6 +129,8 @@ outputs: - test -f $PREFIX/include/cudf/detail/stream_compaction.hpp - test -f $PREFIX/include/cudf/detail/structs/utilities.hpp - test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp + - test -f $PREFIX/include/cudf/detail/timezone.cuh + - test -f $PREFIX/include/cudf/detail/timezone.hpp - test -f $PREFIX/include/cudf/detail/transform.hpp - test -f $PREFIX/include/cudf/detail/transpose.hpp - test -f $PREFIX/include/cudf/detail/unary.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0848af2a916..13583378134 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -369,7 +369,7 @@ add_library( src/io/orc/stripe_data.cu src/io/orc/stripe_enc.cu src/io/orc/stripe_init.cu - src/io/orc/timezone.cpp + src/datetime/timezone.cpp src/io/orc/writer_impl.cu src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp diff --git a/cpp/include/cudf/detail/timezone.cuh b/cpp/include/cudf/detail/timezone.cuh new file mode 100644 index 00000000000..830ee1a7fa6 --- /dev/null +++ b/cpp/include/cudf/detail/timezone.cuh @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::detail { + +/** + * @brief Returns the UT offset for a given date and given timezone table. + * + * @param tz_table Timezone transition table; column 0 holds transition times and column 1 the + * corresponding offsets. Trailing `solar_cycle_entry_count` entries are used for all times beyond + * the one covered by the TZif file + * @param ts Timestamp to find the UT offset for + * + * @return offset from UT, in seconds + */ +inline __device__ duration_s get_ut_offset(table_device_view tz_table, timestamp_s ts) +{ + if (tz_table.num_rows() == 0) { return duration_s{0}; } + + cudf::device_span transition_times(tz_table.column(0).head(), + static_cast(tz_table.num_rows())); + + auto const ts_ttime_it = [&]() { + auto last_less_equal = [](auto begin, auto end, auto value) { + auto const first_larger = thrust::upper_bound(thrust::seq, begin, end, value); + // Return start of the range if all elements are larger than the value + if (first_larger == begin) return begin; + // Element before the first larger element is the last one less or equal + return first_larger - 1; + }; + + auto const file_entry_end = + transition_times.begin() + (transition_times.size() - solar_cycle_entry_count); + + if (ts <= *(file_entry_end - 1)) { + // Search the file entries if the timestamp is in range + return last_less_equal(transition_times.begin(), file_entry_end, ts); + } else { + auto project_to_cycle = 
[](timestamp_s ts) { + // Years divisible by four are leap years + // Exceptions are years divisible by 100, but not divisible by 400 + static constexpr int32_t num_leap_years_in_cycle = + solar_cycle_years / 4 - (solar_cycle_years / 100 - solar_cycle_years / 400); + static constexpr duration_s cycle_s = cuda::std::chrono::duration_cast( + duration_D{365 * solar_cycle_years + num_leap_years_in_cycle}); + return timestamp_s{(ts.time_since_epoch() + cycle_s) % cycle_s}; + }; + // Search the 400-year cycle if outside of the file entries range + return last_less_equal(file_entry_end, transition_times.end(), project_to_cycle(ts)); + } + }(); + + return tz_table.column(1).element(ts_ttime_it - transition_times.begin()); +} + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp new file mode 100644 index 00000000000..f7f97c0a7c2 --- /dev/null +++ b/cpp/include/cudf/detail/timezone.hpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cudf::detail { + +/** + * @copydoc cudf::make_timezone_transition_table(std::optional, std::string_view, + * rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr
make_timezone_transition_table( + std::optional tzif_dir, + std::string_view timezone_name, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace cudf::detail diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp new file mode 100644 index 00000000000..56678c73811 --- /dev/null +++ b/cpp/include/cudf/timezone.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include +#include + +namespace cudf { +class table; + +// Cycle in which the time offsets repeat in Gregorian calendar +static constexpr int32_t solar_cycle_years = 400; +// Number of future entries in the timezone transition table: +// Two entries per year, over the length of the Gregorian calendar's solar cycle +static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; + +/** + * @brief Creates a transition table to convert ORC timestamps to UTC. + * + * Uses system's TZif files. Assumes little-endian platform when parsing these files. + * The transition table starts with the entries from the TZif file. For timestamps after the file's + * last transition, the table includes entries that form a `solar_cycle_years`-year cycle (future + * entries). This portion of the table has `solar_cycle_entry_count` elements, as it assumes two + * transitions per year from Daylight Saving Time. 
If the timezone does not have DST, the table will + * still include the future entries, which will all have the same offset. + * + * @param tzif_dir The directory where the TZif files are located + * @param timezone_name standard timezone name (for example, "America/Los_Angeles") + * @param mr Device memory resource used to allocate the returned table's device memory. + * + * @return The transition table for the given timezone + */ +std::unique_ptr
make_timezone_transition_table( + std::optional tzif_dir, + std::string_view timezone_name, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace cudf diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/datetime/timezone.cpp similarity index 80% rename from cpp/src/io/orc/timezone.cpp rename to cpp/src/datetime/timezone.cpp index 416369cc3f0..55d68fe4a1a 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -13,24 +13,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "timezone.cuh" +#include +#include +#include #include - -#include +#include #include +#include #include namespace cudf { -namespace io { + +namespace { constexpr uint32_t tzif_magic = ('T' << 0) | ('Z' << 8) | ('i' << 16) | ('f' << 24); std::string const tzif_system_directory = "/usr/share/zoneinfo/"; -// Seconds from Jan 1st, 1970 to Jan 1st, 2015 -constexpr int64_t orc_utc_offset = 1420070400; - #pragma pack(push, 1) /** * @brief 32-bit TZif header @@ -129,12 +129,13 @@ struct timezone_file { "Number of transition times is larger than the file size."); } - timezone_file(std::string const& timezone_name) + timezone_file(std::optional tzif_dir, std::string_view timezone_name) { using std::ios_base; // Open the input file - std::string const tz_filename = tzif_system_directory + timezone_name; + auto const tz_filename = + std::filesystem::path{tzif_dir.value_or(tzif_system_directory)} / timezone_name; std::ifstream fin; fin.open(tz_filename, ios_base::in | ios_base::binary | ios_base::ate); CUDF_EXPECTS(fin, "Failed to open the timezone file."); @@ -375,45 +376,62 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) return trans.time + cuda::std::chrono::duration_cast(duration_D{day}).count(); } -timezone_table build_timezone_transition_table(std::string const& timezone_name, - rmm::cuda_stream_view stream) +} // namespace + 
+std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, + std::string_view timezone_name, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::make_timezone_transition_table( + tzif_dir, timezone_name, cudf::get_default_stream(), mr); +} + +namespace detail { + +std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, + std::string_view timezone_name, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (timezone_name == "UTC" || timezone_name.empty()) { // Return an empty table for UTC - return {}; + return std::make_unique(); } - timezone_file const tzf(timezone_name); + timezone_file const tzf(tzif_dir, timezone_name); - std::vector ttimes(1); - std::vector offsets(1); + std::vector transition_times(1); + std::vector offsets(1); // One ancient rule entry, one per TZ file entry, 2 entries per year in the future cycle - ttimes.reserve(1 + tzf.timecnt() + cycle_entry_cnt); - offsets.reserve(1 + tzf.timecnt() + cycle_entry_cnt); + transition_times.reserve(1 + tzf.timecnt() + solar_cycle_entry_count); + offsets.reserve(1 + tzf.timecnt() + solar_cycle_entry_count); size_t earliest_std_idx = 0; for (size_t t = 0; t < tzf.timecnt(); t++) { auto const ttime = tzf.transition_times[t]; auto const idx = tzf.ttime_idx[t]; CUDF_EXPECTS(idx < tzf.typecnt(), "Out-of-range type index"); auto const utcoff = tzf.ttype[idx].utcoff; - ttimes.push_back(ttime); + transition_times.push_back(ttime); offsets.push_back(utcoff); - if (!earliest_std_idx && !tzf.ttype[idx].isdst) { earliest_std_idx = ttimes.size() - 1; } + if (!earliest_std_idx && !tzf.ttype[idx].isdst) { + earliest_std_idx = transition_times.size() - 1; + } } if (tzf.timecnt() != 0) { if (!earliest_std_idx) { earliest_std_idx = 1; } - ttimes[0] = ttimes[earliest_std_idx]; - offsets[0] = offsets[earliest_std_idx]; + transition_times[0] = transition_times[earliest_std_idx]; + offsets[0] = offsets[earliest_std_idx]; } else { if (tzf.typecnt() == 0 || tzf.ttype[0].utcoff == 0) { // No transitions, offset is zero; Table would be a no-op. // Return an empty table to speed up parsing. 
- return {}; + return std::make_unique(); } // No transitions to use for the time/offset - use the first offset and apply to all timestamps - ttimes[0] = std::numeric_limits::max(); - offsets[0] = tzf.ttype[0].utcoff; + transition_times[0] = std::numeric_limits::max(); + offsets[0] = tzf.ttype[0].utcoff; } // Generate entries for times after the last transition @@ -442,19 +460,19 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name, // Add entries to fill the transition cycle int64_t year_timestamp = 0; - for (int32_t year = 1970; year < 1970 + cycle_years; ++year) { + for (int32_t year = 1970; year < 1970 + solar_cycle_years; ++year) { auto const dst_start_time = get_transition_time(dst_start, year); auto const dst_end_time = get_transition_time(dst_end, year); // Two entries per year, since there are two transitions - ttimes.push_back(year_timestamp + dst_start_time - future_std_offset); + transition_times.push_back(year_timestamp + dst_start_time - future_std_offset); offsets.push_back(future_dst_offset); - ttimes.push_back(year_timestamp + dst_end_time - future_dst_offset); + transition_times.push_back(year_timestamp + dst_end_time - future_dst_offset); offsets.push_back(future_std_offset); // Swap the newly added transitions if in descending order - if (ttimes.rbegin()[1] > ttimes.rbegin()[0]) { - std::swap(ttimes.rbegin()[0], ttimes.rbegin()[1]); + if (transition_times.rbegin()[1] > transition_times.rbegin()[0]) { + std::swap(transition_times.rbegin()[0], transition_times.rbegin()[1]); std::swap(offsets.rbegin()[0], offsets.rbegin()[1]); } @@ -463,15 +481,33 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name, .count(); } - rmm::device_uvector d_ttimes = - cudf::detail::make_device_uvector_async(ttimes, stream, rmm::mr::get_current_device_resource()); - rmm::device_uvector d_offsets = cudf::detail::make_device_uvector_async( - offsets, stream, rmm::mr::get_current_device_resource()); - auto const 
gmt_offset = get_gmt_offset(ttimes, offsets, orc_utc_offset); + CUDF_EXPECTS(transition_times.size() == offsets.size(), + "Error reading TZif file for timezone " + std::string{timezone_name}); + + std::vector ttimes_typed; + ttimes_typed.reserve(transition_times.size()); + std::transform(transition_times.cbegin(), + transition_times.cend(), + std::back_inserter(ttimes_typed), + [](auto ts) { return timestamp_s{duration_s{ts}}; }); + std::vector offsets_typed; + offsets_typed.reserve(offsets.size()); + std::transform(offsets.cbegin(), offsets.cend(), std::back_inserter(offsets_typed), [](auto ts) { + return duration_s{ts}; + }); + + auto d_ttimes = cudf::detail::make_device_uvector_async(ttimes_typed, stream, mr); + auto d_offsets = cudf::detail::make_device_uvector_async(offsets_typed, stream, mr); + + std::vector> tz_table_columns; + tz_table_columns.emplace_back(std::make_unique(std::move(d_ttimes))); + tz_table_columns.emplace_back(std::make_unique(std::move(d_offsets))); + + // Need to finish copies before transition_times and offsets go out of scope stream.synchronize(); - return {gmt_offset, std::move(d_ttimes), std::move(d_offsets)}; + return std::make_unique(std::move(tz_table_columns)); } -} // namespace io +} // namespace detail } // namespace cudf diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index d30c3823080..21fc04a69ec 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -38,6 +38,8 @@ namespace io { namespace orc { static constexpr uint32_t block_header_size = 3; +// Seconds from January 1st, 1970 to January 1st, 2015 +static constexpr int64_t orc_utc_epoch = 1420070400; struct PostScript { uint64_t footerLength = 0; // the length of the footer section in bytes diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 43f0565845c..05560a3ca62 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -16,7 +16,7 @@ #pragma once -#include "timezone.cuh" +#include #include "orc.hpp" @@ 
-294,7 +294,7 @@ void DecodeOrcColumnData(ColumnDesc* chunks, uint32_t num_columns, uint32_t num_stripes, size_t first_row, - timezone_table_view tz_table, + table_device_view tz_table, uint32_t num_rowgroups, uint32_t rowidx_stride, size_t level, diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index bbc88a16c6a..bcf53159676 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -23,13 +23,13 @@ #include "orc_gpu.hpp" #include "reader_impl.hpp" -#include "timezone.cuh" #include #include #include #include +#include #include #include #include @@ -603,7 +603,7 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& void reader::impl::decode_stream_data(cudf::detail::hostdevice_2dvector& chunks, size_t num_dicts, size_t skip_rows, - timezone_table_view tz_table, + table_device_view tz_table, cudf::detail::hostdevice_2dvector& row_groups, size_t row_index_stride, std::vector& out_buffers, @@ -915,11 +915,11 @@ reader::impl::impl(std::vector>&& sources, decimal128_columns = options.get_decimal128_columns(); } -timezone_table reader::impl::compute_timezone_table( +std::unique_ptr
reader::impl::compute_timezone_table( const std::vector& selected_stripes, rmm::cuda_stream_view stream) { - if (selected_stripes.empty()) return {}; + if (selected_stripes.empty()) return std::make_unique(); auto const has_timestamp_column = std::any_of( selected_columns.levels.cbegin(), selected_columns.levels.cend(), [&](auto& col_lvl) { @@ -927,10 +927,10 @@ timezone_table reader::impl::compute_timezone_table( return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; }); }); - if (not has_timestamp_column) return {}; + if (not has_timestamp_column) return std::make_unique(); - return build_timezone_transition_table(selected_stripes[0].stripe_info[0].second->writerTimezone, - stream); + return cudf::detail::make_timezone_transition_table( + {}, selected_stripes[0].stripe_info[0].second->writerTimezone, stream); } table_with_metadata reader::impl::read(size_type skip_rows, @@ -1238,10 +1238,11 @@ table_with_metadata reader::impl::read(size_type skip_rows, } if (not is_level_data_empty) { + auto const tz_table_dview = table_device_view::create(tz_table->view(), stream); decode_stream_data(chunks, num_dict_entries, skip_rows, - tz_table.view(), + *tz_table_dview, row_groups, _metadata.get_row_index_stride(), out_buffers[level], diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 96492e4c2b2..94b0fdc09d2 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -145,7 +145,7 @@ class reader::impl { void decode_stream_data(cudf::detail::hostdevice_2dvector& chunks, size_t num_dicts, size_t skip_rows, - timezone_table_view tz_table, + table_device_view tz_table, cudf::detail::hostdevice_2dvector& row_groups, size_t row_index_stride, std::vector& out_buffers, @@ -210,7 +210,7 @@ class reader::impl { * * @return Timezone table with timestamp offsets */ - timezone_table compute_timezone_table( + std::unique_ptr
compute_timezone_table( const std::vector& selected_stripes, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index d0d077d2611..8e698dd9dff 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,9 +43,6 @@ inline __device__ uint8_t is_rlev1(uint8_t encoding_mode) { return encoding_mode inline __device__ uint8_t is_dictionary(uint8_t encoding_mode) { return encoding_mode & 1; } -static __device__ __constant__ int64_t kORCTimeToUTC = - 1420070400; // Seconds from January 1st, 1970 to January 1st, 2015 - struct orc_bytestream_s { const uint8_t* base; uint32_t pos; @@ -101,7 +98,7 @@ struct orc_datadec_state_s { uint32_t max_vals; // max # of non-zero values to decode in this batch uint32_t nrows; // # of rows in current batch (up to block_size) uint32_t buffered_count; // number of buffered values in the secondary data stream - int64_t utc_epoch; // kORCTimeToUTC - gmtOffset + duration_s tz_epoch; // orc_ut_epoch - ut_offset RowGroup index; }; @@ -1374,7 +1371,7 @@ template __global__ void __launch_bounds__(block_size) gpuDecodeOrcColumnData(ColumnDesc* chunks, DictionaryEntry* global_dictionary, - timezone_table_view tz_table, + table_device_view tz_table, device_2dspan row_groups, size_t first_row, uint32_t rowidx_stride, @@ -1446,7 +1443,8 @@ __global__ void __launch_bounds__(block_size) } if (!is_dictionary(s->chunk.encoding_kind)) { s->chunk.dictionary_start = 0; } - s->top.data.utc_epoch = kORCTimeToUTC - tz_table.gmt_offset; + static constexpr duration_s d_orc_utc_epoch = duration_s{orc_utc_epoch}; + s->top.data.tz_epoch = d_orc_utc_epoch - get_ut_offset(tz_table, timestamp_s{d_orc_utc_epoch}); bytestream_init(&s->bs, 
s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); @@ -1769,37 +1767,33 @@ __global__ void __launch_bounds__(block_size) break; } case TIMESTAMP: { - int64_t seconds = s->vals.i64[t + vals_skipped] + s->top.data.utc_epoch; - int64_t nanos = secondary_val; - nanos = (nanos >> 3) * kTimestampNanoScale[nanos & 7]; - if (!tz_table.ttimes.empty()) { - seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds); - } + auto seconds = s->top.data.tz_epoch + duration_s{s->vals.i64[t + vals_skipped]}; + // Convert to UTC + seconds += get_ut_offset(tz_table, timestamp_s{seconds}); + + duration_ns nanos = duration_ns{(static_cast(secondary_val) >> 3) * + kTimestampNanoScale[secondary_val & 7]}; + // Adjust seconds only for negative timestamps with positive nanoseconds. // Alternative way to represent negative timestamps is with negative nanoseconds // in which case the adjustment in not needed. // Comparing with 999999 instead of zero to match the apache writer. 
- if (seconds < 0 and nanos > 999999) { seconds -= 1; } - - duration_ns d_ns{nanos}; - duration_s d_s{seconds}; + if (seconds.count() < 0 and nanos.count() > 999999) { seconds -= duration_s{1}; } static_cast(data_out)[row] = [&]() { using cuda::std::chrono::duration_cast; switch (s->chunk.timestamp_type_id) { case type_id::TIMESTAMP_SECONDS: - return d_s.count() + duration_cast(d_ns).count(); + return (seconds + duration_cast(nanos)).count(); case type_id::TIMESTAMP_MILLISECONDS: - return duration_cast(d_s).count() + - duration_cast(d_ns).count(); + return (seconds + duration_cast(nanos)).count(); case type_id::TIMESTAMP_MICROSECONDS: - return duration_cast(d_s).count() + - duration_cast(d_ns).count(); + return (seconds + duration_cast(nanos)).count(); case type_id::TIMESTAMP_NANOSECONDS: default: - return duration_cast(d_s).count() + - d_ns.count(); // nanoseconds as output in case of `type_id::EMPTY` and - // `type_id::TIMESTAMP_NANOSECONDS` + // nanoseconds as output in case of `type_id::EMPTY` and + // `type_id::TIMESTAMP_NANOSECONDS` + return (seconds + nanos).count(); } }(); @@ -1887,7 +1881,7 @@ void __host__ DecodeOrcColumnData(ColumnDesc* chunks, uint32_t num_columns, uint32_t num_stripes, size_t first_row, - timezone_table_view tz_table, + table_device_view tz_table, uint32_t num_rowgroups, uint32_t rowidx_stride, size_t level, diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 9032e3d2502..427167e2d0f 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -49,9 +49,6 @@ constexpr int scratch_buffer_size = 512 * 4; // Workaround replaces zero-length patch lists by a dummy zero patch constexpr bool zero_pll_war = true; -static __device__ __constant__ int64_t kORCTimeToUTC = - 1420070400; // Seconds from January 1st, 1970 to January 1st, 2015 - struct byterle_enc_state_s { uint32_t literal_run; uint32_t repeat_run; @@ -814,7 +811,7 @@ __global__ void __launch_bounds__(block_size) int32_t ts_scale = powers_of_ten[9 - min(s->chunk.scale, 9)]; int64_t seconds = ts / ts_scale; int64_t nanos = (ts - seconds * ts_scale); - s->vals.i64[nz_idx] = seconds - kORCTimeToUTC; + s->vals.i64[nz_idx] = seconds - orc_utc_epoch; if (nanos != 0) { // Trailing zeroes are encoded in the lower 3-bits uint32_t zeroes = 0; diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh deleted file mode 100644 index 52736d6451a..00000000000 --- a/cpp/src/io/orc/timezone.cuh +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -namespace cudf { -namespace io { - -struct timezone_table_view { - int32_t gmt_offset = 0; - cudf::device_span ttimes; - cudf::device_span offsets; -}; - -// Cycle in which the time offsets repeat -static constexpr int32_t cycle_years = 400; -// Number of seconds in 400 years -static constexpr int64_t cycle_seconds = - cuda::std::chrono::duration_cast(duration_D{365 * cycle_years + (100 - 3)}).count(); -// Two entries per year, over the length of the cycle -static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; - -/** - * @brief Returns the GMT offset for a given date and given timezone table. - * - * @param ttimes Transition times; trailing `cycle_entry_cnt` entries are used for all times - * beyond the one covered by the TZif file - * @param offsets Time offsets in specific intervals; trailing `cycle_entry_cnt` entries are used - * for all times beyond the one covered by the TZif file - * @param count Number of elements in @p ttimes and @p offsets - * @param ts ORC timestamp - * - * @return GMT offset - */ -CUDF_HOST_DEVICE inline int32_t get_gmt_offset_impl(int64_t const* ttimes, - int32_t const* offsets, - size_t count, - int64_t ts) -{ - // Returns start of the range if all elements are larger than the input timestamp - auto last_less_equal_ttime_idx = [&](long begin_idx, long end_idx, int64_t ts) { - auto const first_larger_ttime = - thrust::upper_bound(thrust::seq, ttimes + begin_idx, ttimes + end_idx, ts); - // Element before the first larger element is the last one less of equal - return std::max(first_larger_ttime - ttimes - 1, begin_idx); - }; - - auto const file_entry_cnt = count - cycle_entry_cnt; - // Search in the file entries if the timestamp is in range - if (ts <= ttimes[file_entry_cnt - 1]) { - return offsets[last_less_equal_ttime_idx(0, file_entry_cnt, ts)]; - } else { - // Search in the 400-year cycle 
if outside of the file entries range - return offsets[last_less_equal_ttime_idx( - file_entry_cnt, count, (ts + cycle_seconds) % cycle_seconds)]; - } -} - -/** - * @brief Host `get_gmt_offset` interface. - * - * Implemented in `get_gmt_offset_impl`. - */ -inline __host__ int32_t get_gmt_offset(cudf::host_span ttimes, - cudf::host_span offsets, - int64_t ts) -{ - CUDF_EXPECTS(ttimes.size() == offsets.size(), - "transition times and offsets must have the same length"); - return get_gmt_offset_impl(ttimes.begin(), offsets.begin(), ttimes.size(), ts); -} - -/** - * @brief Device `get_gmt_offset` interface. - * - * Implemented in `get_gmt_offset_impl`. - */ -inline __device__ int32_t get_gmt_offset(cudf::device_span ttimes, - cudf::device_span offsets, - int64_t ts) -{ - return get_gmt_offset_impl(ttimes.begin(), offsets.begin(), ttimes.size(), ts); -} - -class timezone_table { - int32_t gmt_offset = 0; - rmm::device_uvector ttimes; - rmm::device_uvector offsets; - - public: - // Safe to use the default stream, device_uvectors will not change after they are created empty - timezone_table() : ttimes{0, cudf::get_default_stream()}, offsets{0, cudf::get_default_stream()} - { - } - timezone_table(int32_t gmt_offset, - rmm::device_uvector&& ttimes, - rmm::device_uvector&& offsets) - : gmt_offset{gmt_offset}, ttimes{std::move(ttimes)}, offsets{std::move(offsets)} - { - } - [[nodiscard]] timezone_table_view view() const { return {gmt_offset, ttimes, offsets}; } -}; - -/** - * @brief Creates a transition table to convert ORC timestamps to UTC. - * - * Uses system's TZif files. Assumes little-endian platform when parsing these files. 
- * - * @param timezone_name standard timezone name (for example, "US/Pacific") - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return The transition table for the given timezone - */ -timezone_table build_timezone_transition_table(std::string const& timezone_name, - rmm::cuda_stream_view stream); - -} // namespace io -} // namespace cudf From 00c6000fd9a9a2718878e8314c8096ae9cdade25 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 22 Mar 2023 15:01:32 -0500 Subject: [PATCH 40/63] Add `force_nullable_schema` parameter to Parquet writer. (#12952) Requires: https://github.com/rapidsai/cudf/pull/12933 This PR adds `nullability` parameter to parquet writer. When it is `True`, all columns are written as `null` in the schema. When `False`, all columns are written as `not null` in the schema, however, if a column contains null values, this parameter is ignored. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12952 --- python/cudf/cudf/_lib/cpp/io/types.pxd | 3 +- python/cudf/cudf/_lib/parquet.pyx | 46 ++++++++++++++++++++++---- python/cudf/cudf/io/parquet.py | 13 +++++++- python/cudf/cudf/tests/test_parquet.py | 33 ++++++++++++++++++ python/cudf/cudf/utils/ioutils.py | 4 +++ 5 files changed, 90 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index 21809ef7bd9..b2b0a77c45f 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from libc.stdint cimport uint8_t from libcpp cimport bool @@ -74,6 +74,7 @@ cdef extern from "cudf/io/types.hpp" \ column_in_metadata& set_decimal_precision(uint8_t precision) column_in_metadata& child(size_type i) column_in_metadata& set_output_as_binary(bool binary) + string get_name() cdef cppclass table_input_metadata: table_input_metadata() except + diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 464d9243408..59571b0e4b3 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -321,7 +321,8 @@ def write_parquet( object row_group_size_rows=None, object max_page_size_bytes=None, object max_page_size_rows=None, - object partitions_info=None + object partitions_info=None, + object force_nullable_schema=False, ): """ Cython function to call into libcudf API, see `write_parquet`. @@ -364,7 +365,9 @@ def write_parquet( tbl_meta.get().column_metadata[i].set_name(name.encode()) _set_col_metadata( - table[name]._column, tbl_meta.get().column_metadata[i] + table[name]._column, + tbl_meta.get().column_metadata[i], + force_nullable_schema ) cdef map[string, string] tmp_user_data @@ -467,6 +470,16 @@ cdef class ParquetWriter: max_page_size_rows: int, default 20000 Maximum number of rows of each page of the output. By default, 20000 will be used. + force_nullable_schema : bool, default True. + If True, writes all columns as `null` in schema. + If False, columns are written as `null` if they contain null values, + otherwise as `not null`. + + Notes + ----- + `DataFrame.to_parquet` and `ParquetWriter` differ in the default + value for `force_nullable_schema` to enable all the chunks being + written by chunked parquet writer to be schema identical. 
See Also -------- @@ -484,13 +497,15 @@ cdef class ParquetWriter: cdef size_type row_group_size_rows cdef size_t max_page_size_bytes cdef size_type max_page_size_rows + cdef bool force_nullable_schema def __cinit__(self, object filepath_or_buffer, object index=None, object compression="snappy", str statistics="ROWGROUP", int row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, int row_group_size_rows=1000000, int max_page_size_bytes=524288, - int max_page_size_rows=20000): + int max_page_size_rows=20000, + bool force_nullable_schema=True): filepaths_or_buffers = ( list(filepath_or_buffer) if is_list_like(filepath_or_buffer) @@ -505,6 +520,7 @@ cdef class ParquetWriter: self.row_group_size_rows = row_group_size_rows self.max_page_size_bytes = max_page_size_bytes self.max_page_size_rows = max_page_size_rows + self.force_nullable_schema = force_nullable_schema def write_table(self, table, object partitions_info=None): """ Writes a single table to the file """ @@ -597,7 +613,9 @@ cdef class ParquetWriter: for i, name in enumerate(table._column_names, num_index_cols_meta): self.tbl_meta.get().column_metadata[i].set_name(name.encode()) _set_col_metadata( - table[name]._column, self.tbl_meta.get().column_metadata[i] + table[name]._column, + self.tbl_meta.get().column_metadata[i], + self.force_nullable_schema ) index = ( @@ -675,15 +693,29 @@ cdef cudf_io_types.compression_type _get_comp_type(object compression): raise ValueError("Unsupported `compression` type") -cdef _set_col_metadata(Column col, column_in_metadata& col_meta): +cdef _set_col_metadata( + Column col, + column_in_metadata& col_meta, + bool force_nullable_schema +): + col_meta.set_nullability(force_nullable_schema or col.nullable) + if is_struct_dtype(col): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): col_meta.child(i).set_name(name.encode()) - _set_col_metadata(child_col, col_meta.child(i)) + _set_col_metadata( + child_col, + col_meta.child(i), + force_nullable_schema + 
) elif is_list_dtype(col): - _set_col_metadata(col.children[1], col_meta.child(1)) + _set_col_metadata( + col.children[1], + col_meta.child(1), + force_nullable_schema + ) else: if is_decimal_dtype(col): col_meta.set_decimal_precision(col.dtype.precision) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 1b7c1116205..3e1a4b1f024 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -60,6 +60,7 @@ def _write_parquet( max_page_size_rows=None, partitions_info=None, storage_options=None, + force_nullable_schema=False, ): if is_list_like(paths) and len(paths) > 1: if partitions_info is None: @@ -89,6 +90,7 @@ def _write_parquet( "max_page_size_bytes": max_page_size_bytes, "max_page_size_rows": max_page_size_rows, "partitions_info": partitions_info, + "force_nullable_schema": force_nullable_schema, } if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs): with ExitStack() as stack: @@ -126,6 +128,7 @@ def write_to_dataset( max_page_size_bytes=None, max_page_size_rows=None, storage_options=None, + force_nullable_schema=False, ): """Wraps `to_parquet` to write partitioned Parquet datasets. For each combination of partition group and value, @@ -179,7 +182,6 @@ def write_to_dataset( max_page_size_rows: integer or None, default None Maximum number of rows of each page of the output. If None, 20000 will be used. - storage_options : dict, optional, default None Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the @@ -187,6 +189,10 @@ def write_to_dataset( header options. For other URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more details. + force_nullable_schema : bool, default False. + If True, writes all columns as `null` in schema. + If False, columns are written as `null` if they contain null values, + otherwise as `not null`. 
""" fs = ioutils._ensure_filesystem(fs, root_path, storage_options) @@ -224,6 +230,7 @@ def write_to_dataset( row_group_size_rows=row_group_size_rows, max_page_size_bytes=max_page_size_bytes, max_page_size_rows=max_page_size_rows, + force_nullable_schema=force_nullable_schema, ) else: @@ -244,6 +251,7 @@ def write_to_dataset( row_group_size_rows=row_group_size_rows, max_page_size_bytes=max_page_size_bytes, max_page_size_rows=max_page_size_rows, + force_nullable_schema=force_nullable_schema, ) return metadata @@ -712,6 +720,7 @@ def to_parquet( max_page_size_rows=None, storage_options=None, return_metadata=False, + force_nullable_schema=False, *args, **kwargs, ): @@ -760,6 +769,7 @@ def to_parquet( max_page_size_rows=max_page_size_rows, return_metadata=return_metadata, storage_options=storage_options, + force_nullable_schema=force_nullable_schema, ) partition_info = ( @@ -784,6 +794,7 @@ def to_parquet( max_page_size_rows=max_page_size_rows, partitions_info=partition_info, storage_options=storage_options, + force_nullable_schema=force_nullable_schema, ) else: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 7cc67347467..9b783b03dad 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2775,3 +2775,36 @@ def test_parquet_reader_unsupported_page_encoding(datadir): # expect a failure when reading the whole file with pytest.raises(RuntimeError): cudf.read_parquet(fname) + + +@pytest.mark.parametrize("data", [{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}]) +@pytest.mark.parametrize("force_nullable_schema", [True, False]) +def test_parquet_writer_schema_nullability(data, force_nullable_schema): + df = cudf.DataFrame(data) + file_obj = BytesIO() + + df.to_parquet(file_obj, force_nullable_schema=force_nullable_schema) + + assert pa.parquet.read_schema(file_obj).field(0).nullable == ( + force_nullable_schema or df.isnull().any().any() + ) + + +@pytest.mark.parametrize("data", 
[{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}]) +@pytest.mark.parametrize("force_nullable_schema", [True, False]) +def test_parquet_chunked_writer_schema_nullability( + data, force_nullable_schema +): + df = cudf.DataFrame(data) + file_obj = BytesIO() + + writer = ParquetWriter( + file_obj, force_nullable_schema=force_nullable_schema + ) + + writer.write_table(df) + + writer.close() + assert pa.parquet.read_schema(file_obj).field(0).nullable == ( + force_nullable_schema or df.isnull().any().any() + ) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 5f39c8722d9..bf51b360fec 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -290,6 +290,10 @@ include the file path metadata (relative to `root_path`). To request metadata binary blob when using with ``partition_cols``, Pass ``return_metadata=True`` instead of specifying ``metadata_file_path`` +force_nullable_schema : bool, default False. + If True, writes all columns as `null` in schema. + If False, columns are written as `null` if they contain null values, + otherwise as `not null`. **kwargs Additional parameters will be passed to execution engines other than ``cudf``. From 253f2ab1dd89e03726a103185bbae4df8db4bb1e Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Wed, 22 Mar 2023 16:53:11 -0400 Subject: [PATCH 41/63] Use rapids-cmake parallel testing feature (#12451) Converts libcudf over to use rapids-cmake new GPU aware parallel testing feature, which allows tests to run across all the GPUs on a machine without oversubscription. This will allow developers to run `ctest -j` and ctest will figure out given the current machine how many tests it can run in parallel given the current GPU set ( currently 2 tests per GPU ). 
Authors: - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/12451 --- ci/test_cpp.sh | 42 ++++++----- cpp/cmake/config.json | 6 +- cpp/include/cudf_test/base_fixture.hpp | 5 +- cpp/libcudf_kafka/tests/CMakeLists.txt | 23 +++--- cpp/tests/CMakeLists.txt | 99 +++++++++++++++++++++----- dependencies.yaml | 1 + 6 files changed, 126 insertions(+), 50 deletions(-) diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 3f65399d3af..846b90c78e5 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -16,28 +16,26 @@ echo "STREAM_IDENTIFY_LIB=${STREAM_IDENTIFY_LIB_MODE_CUDF}" # Run libcudf and libcudf_kafka gtests from libcudf-tests package rapids-logger "Run gtests" -# TODO: exit code handling is too verbose. Find a cleaner solution. - -for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do - test_name=$(basename ${gt}) - echo "Running gtest $test_name" - - # TODO: This strategy for using the stream lib will need to change when we - # switch to invoking ctest. For one, we will want to set the test - # properties to use the lib (which means that the decision will be made at - # CMake-configure time instead of runtime). We may also need to leverage - # something like gtest_discover_tests to be able to filter on the - # underlying test names. - if [[ ${test_name} == "SPAN_TEST" ]]; then - # This one test is specifically designed to test using a thrust device - # vector, so we expect and allow it to include default stream usage. 
- gtest_filter="SpanTest.CanConstructFromDeviceContainers" - GTEST_CUDF_STREAM_MODE="new_cudf_default" LD_PRELOAD=${STREAM_IDENTIFY_LIB_MODE_CUDF} ${gt} --gtest_output=xml:${RAPIDS_TESTS_DIR} --gtest_filter="-${gtest_filter}" && \ - ${gt} --gtest_output=xml:${RAPIDS_TESTS_DIR} --gtest_filter="${gtest_filter}" - else - GTEST_CUDF_STREAM_MODE="new_cudf_default" LD_PRELOAD=${STREAM_IDENTIFY_LIB_MODE_CUDF} ${gt} --gtest_output=xml:${RAPIDS_TESTS_DIR} - fi -done +cd $CONDA_PREFIX/bin/gtests/libcudf/ +export GTEST_CUDF_STREAM_MODE="new_cudf_default" +export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/ +export LD_PRELOAD=${STREAM_IDENTIFY_LIB_MODE_CUDF} + +ctest -E SPAN_TEST -j20 --output-on-failure + +# This one test is specifically designed to test using a thrust device vector, +# so we expect and allow it to include default stream usage. +_allowlist_filter="SpanTest.CanConstructFromDeviceContainers" +GTEST_FILTER="-${_allowlist_filter}" ctest -R SPAN_TEST -VV +LD_PRELOAD= GTEST_CUDF_STREAM_MODE=default GTEST_FILTER="${_allowlist_filter}" ctest -R SPAN_TEST -VV + +SUITEERROR=$? + +if (( ${SUITEERROR} == 0 )); then + cd $CONDA_PREFIX/bin/gtests/libcudf_kafka/ + ctest -j20 --output-on-failure + SUITEERROR=$? 
+fi rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/cpp/cmake/config.json b/cpp/cmake/config.json index f7d7b001856..a65afe9e58d 100644 --- a/cpp/cmake/config.json +++ b/cpp/cmake/config.json @@ -13,7 +13,11 @@ } }, "ConfigureTest": { - "flags": ["TEST_NAME", "TEST_SRC"] + "flags": ["TEST_NAME", "TEST_SRC"], + "kwargs": { + "GPUS": 1, + "PERCENT": 1 + } }, "ConfigureBench": { "flags": ["BENCH_NAME", "BENCH_SRC"] diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp index 6bdfc7bfe98..1477314c592 100644 --- a/cpp/include/cudf_test/base_fixture.hpp +++ b/cpp/include/cudf_test/base_fixture.hpp @@ -244,7 +244,10 @@ inline auto make_managed() { return std::make_shared(make_cuda()); + auto const [free, total] = rmm::detail::available_device_memory(); + auto min_alloc = + rmm::detail::align_down(std::min(free, total / 10), rmm::detail::CUDA_ALLOCATION_ALIGNMENT); + return rmm::mr::make_owning_wrapper(make_cuda(), min_alloc); } inline auto make_arena() diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt index afa10f02c16..68a5327b455 100644 --- a/cpp/libcudf_kafka/tests/CMakeLists.txt +++ b/cpp/libcudf_kafka/tests/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -13,7 +13,12 @@ # ============================================================================= # ################################################################################################## -# * compiler function ----------------------------------------------------------------------------- +# enable testing ################################################################################ +# ################################################################################################## +enable_testing() + +include(rapids-test) +rapids_test_init() # This function takes in a test name and test source and handles setting all of the associated # properties and linking to build the test @@ -27,12 +32,12 @@ function(ConfigureTest test_name) target_link_libraries( ${test_name} PRIVATE GTest::gmock GTest::gmock_main GTest::gtest_main cudf_kafka ) - add_test(NAME ${test_name} COMMAND ${test_name}) - install( - TARGETS ${test_name} - COMPONENT testing - DESTINATION bin/gtests/libcudf_kafka - EXCLUDE_FROM_ALL + rapids_test_add( + NAME ${test_name} + COMMAND ${test_name} + GPUS 1 + PERCENT 25 + INSTALL_COMPONENT_SET testing ) endfunction() @@ -40,3 +45,5 @@ endfunction() # * Kafka host tests # ---------------------------------------------------------------------------------- ConfigureTest(KAFKA_HOST_TEST kafka_consumer_tests.cpp) + +rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libcudf_kafka) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 0d58b19de6a..bd4077aff4e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -13,12 +13,32 @@ # ============================================================================= # ################################################################################################## -# * compiler function ----------------------------------------------------------------------------- +# enable testing 
################################################################################ +# ################################################################################################## +enable_testing() + +include(rapids-test) +rapids_test_init() # This function takes in a test name and test source and handles setting all of the associated # properties and linking to build the test function(ConfigureTest CMAKE_TEST_NAME) - add_executable(${CMAKE_TEST_NAME} ${ARGN}) + set(options) + set(one_value GPUS PERCENT) + set(multi_value) + cmake_parse_arguments(_CUDF_TEST "${options}" "${one_value}" "${multi_value}" ${ARGN}) + if(NOT DEFINED _CUDF_TEST_GPUS AND NOT DEFINED _CUDF_TEST_PERCENT) + set(_CUDF_TEST_GPUS 1) + set(_CUDF_TEST_PERCENT 15) + endif() + if(NOT DEFINED _CUDF_TEST_GPUS) + set(_CUDF_TEST_GPUS 1) + endif() + if(NOT DEFINED _CUDF_TEST_PERCENT) + set(_CUDF_TEST_PERCENT 100) + endif() + + add_executable(${CMAKE_TEST_NAME} ${_CUDF_TEST_UNPARSED_ARGUMENTS}) set_target_properties( ${CMAKE_TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" @@ -35,12 +55,12 @@ function(ConfigureTest CMAKE_TEST_NAME) ${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main $ ) - add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) - install( - TARGETS ${CMAKE_TEST_NAME} - COMPONENT testing - DESTINATION bin/gtests/libcudf - EXCLUDE_FROM_ALL + rapids_test_add( + NAME ${CMAKE_TEST_NAME} + COMMAND ${CMAKE_TEST_NAME} + GPUS ${_CUDF_TEST_GPUS} + PERCENT ${_CUDF_TEST_PERCENT} + INSTALL_COMPONENT_SET testing ) endfunction() @@ -112,6 +132,8 @@ ConfigureTest( groupby/sum_tests.cpp groupby/tdigest_tests.cu groupby/var_tests.cpp + GPUS 1 + PERCENT 100 ) # ################################################################################################## @@ -138,6 +160,8 @@ ConfigureTest(HASHING_TEST hashing/hash_test.cpp) ConfigureTest( PARTITIONING_TEST partitioning/hash_partition_test.cpp partitioning/round_robin_test.cpp partitioning/partition_test.cpp + GPUS 1 + 
PERCENT 70 ) # ################################################################################################## @@ -149,6 +173,8 @@ ConfigureTest(HASH_MAP_TEST hash_map/map_test.cu) ConfigureTest( QUANTILES_TEST quantiles/percentile_approx_test.cpp quantiles/quantile_test.cpp quantiles/quantiles_test.cpp + GPUS 1 + PERCENT 70 ) # ################################################################################################## @@ -162,6 +188,8 @@ ConfigureTest( reductions/segmented_reduction_tests.cpp reductions/list_rank_test.cpp reductions/tdigest_tests.cu + GPUS 1 + PERCENT 70 ) # ################################################################################################## @@ -221,17 +249,41 @@ ConfigureTest( # * io tests -------------------------------------------------------------------------------------- ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp) -ConfigureTest(CSV_TEST io/csv_test.cpp) -ConfigureTest(FILE_IO_TEST io/file_io_test.cpp) -ConfigureTest(ORC_TEST io/orc_test.cpp) -ConfigureTest(PARQUET_TEST io/parquet_test.cpp io/parquet_chunked_reader_test.cpp) -ConfigureTest(JSON_TEST io/json_test.cpp io/json_chunked_reader.cpp) +ConfigureTest( + CSV_TEST io/csv_test.cpp + GPUS 1 + PERCENT 30 +) +ConfigureTest( + FILE_IO_TEST io/file_io_test.cpp + GPUS 1 + PERCENT 30 +) +ConfigureTest( + ORC_TEST io/orc_test.cpp + GPUS 1 + PERCENT 30 +) +ConfigureTest( + PARQUET_TEST io/parquet_test.cpp io/parquet_chunked_reader_test.cpp + GPUS 1 + PERCENT 30 +) +ConfigureTest( + JSON_TEST io/json_test.cpp io/json_chunked_reader.cpp + GPUS 1 + PERCENT 30 +) ConfigureTest(JSON_WRITER_TEST io/json_writer.cpp) ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu) ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) -ConfigureTest(DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp) +ConfigureTest( 
+ DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp + GPUS 1 + PERCENT 30 +) target_link_libraries(DATA_CHUNK_SOURCE_TEST PRIVATE ZLIB::ZLIB) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) ConfigureTest(FST_TEST io/fst/fst_test.cu) @@ -245,6 +297,8 @@ endif() ConfigureTest( SORT_TEST sort/segmented_sort_tests.cpp sort/sort_test.cpp sort/stable_sort_tests.cpp sort/rank_test.cpp + GPUS 1 + PERCENT 70 ) # ################################################################################################## @@ -275,6 +329,8 @@ ConfigureTest( copying/split_tests.cpp copying/utility_tests.cpp copying/reverse_tests.cpp + GPUS 1 + PERCENT 70 ) # ################################################################################################## @@ -318,7 +374,11 @@ ConfigureTest(DEVICE_ATOMICS_TEST device_atomics/device_atomics_test.cu) # ################################################################################################## # * transpose tests ------------------------------------------------------------------------------- -ConfigureTest(TRANSPOSE_TEST transpose/transpose_test.cpp) +ConfigureTest( + TRANSPOSE_TEST transpose/transpose_test.cpp + GPUS 1 + PERCENT 70 +) # ################################################################################################## # * table tests ----------------------------------------------------------------------------------- @@ -359,6 +419,8 @@ ConfigureTest( rolling/range_rolling_window_test.cpp rolling/range_window_bounds_test.cpp rolling/rolling_test.cpp + GPUS 1 + PERCENT 70 ) # ################################################################################################## @@ -504,6 +566,8 @@ ConfigureTest( lists/sort_lists_tests.cpp lists/stream_compaction/apply_boolean_mask_tests.cpp lists/stream_compaction/distinct_tests.cpp + GPUS 1 + PERCENT 70 ) # ################################################################################################## @@ -524,7 +588,6 @@ set_tests_properties( ) # 
################################################################################################## -# enable testing ################################################################################ +# Install tests #################################################################################### # ################################################################################################## - -enable_testing() +rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/libcudf) diff --git a/dependencies.yaml b/dependencies.yaml index 12a6d1b370e..5f72b8b6dea 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -410,6 +410,7 @@ dependencies: common: - output_types: conda packages: + - *cmake_ver - *gtest - *gmock specific: From 0d1fb96110ebd902457dcaaeb821e6aa63c9970d Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Wed, 22 Mar 2023 16:04:29 -0500 Subject: [PATCH 42/63] Fix `sort_values` when column is all empty strings (#12988) See test for simple MRE. This fixes https://github.com/rapidsai/cugraph/issues/3058 Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12988 --- python/dask_cudf/dask_cudf/sorting.py | 10 ++++++---- python/dask_cudf/dask_cudf/tests/test_sort.py | 12 +++++++++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index 0f2dc0d4efc..e841f2d8830 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from collections.abc import Iterator @@ -218,9 +218,11 @@ def quantile_divisions(df, by, npartitions): divisions[col].iloc[-1] += 1 divisions[col] = divisions[col].astype(dtype) else: - divisions[col].iloc[-1] = chr( - ord(divisions[col].iloc[-1][0]) + 1 - ) + if last := divisions[col].iloc[-1]: + val = chr(ord(last[0]) + 1) + else: + val = "this string intentionally left empty" # any but "" + divisions[col].iloc[-1] = val divisions = divisions.drop_duplicates().sort_index() return divisions diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 770a52316b6..94609b180d6 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -104,3 +104,13 @@ def f(partition, by_columns, ascending, na_position, **kwargs): ) expect = df.sort_values(by=by) dd.assert_eq(got, expect, check_index=False) + + +@pytest.mark.parametrize("by", ["a", "b", ["a", "b"], ["b", "a"]]) +def test_sort_values_empty_string(by): + df = cudf.DataFrame({"a": [3, 2, 1, 4], "b": [""] * 4}) + ddf = dd.from_pandas(df, npartitions=2) + got = ddf.sort_values(by) + if "a" in by: + expect = df.sort_values(by) + assert dd.assert_eq(got, expect, check_index=False) From d9ad58cd76e35b062094305b1d6ace2e6ea3e42c Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 22 Mar 2023 17:50:45 -0400 Subject: [PATCH 43/63] Rework gtests SequenceTest to remove using namespace cudf (#12985) Remove the `using namespace cudf;` from the top of the source file to make it easier to follow.
Seemed an unnecessary usage and violated the spirit of #12784 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/12985 --- cpp/tests/filling/sequence_tests.cpp | 86 +++++++++++++--------------- 1 file changed, 40 insertions(+), 46 deletions(-) diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp index 383a69affa1..1f55cdf3df0 100644 --- a/cpp/tests/filling/sequence_tests.cpp +++ b/cpp/tests/filling/sequence_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,6 @@ * limitations under the License. */ -#include - #include #include #include @@ -23,11 +21,7 @@ #include #include -#include -#include - -using namespace cudf; -using namespace cudf::test; +#include template class SequenceTypedTestFixture : public cudf::test::BaseFixture { @@ -44,13 +38,13 @@ TYPED_TEST(SequenceTypedTestFixture, Incrementing) { using T = TypeParam; - numeric_scalar init(0); - numeric_scalar step(1); + cudf::numeric_scalar init(0); + cudf::numeric_scalar step(1); - size_type num_els = 10; + cudf::size_type num_els = 10; T expected[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); auto result = cudf::sequence(num_els, init, step); @@ -61,13 +55,13 @@ TYPED_TEST(SequenceTypedTestFixture, Decrementing) { using T = TypeParam; - numeric_scalar init(0); - numeric_scalar step(-5); + cudf::numeric_scalar init(0); + cudf::numeric_scalar step(-5); - size_type num_els = 10; + 
cudf::size_type num_els = 10; T expected[] = {0, -5, -10, -15, -20, -25, -30, -35, -40, -45}; - fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); auto result = cudf::sequence(num_els, init, step); @@ -78,13 +72,13 @@ TYPED_TEST(SequenceTypedTestFixture, EmptyOutput) { using T = TypeParam; - numeric_scalar init(0); - numeric_scalar step(-5); + cudf::numeric_scalar init(0); + cudf::numeric_scalar step(-5); - size_type num_els = 0; + cudf::size_type num_els = 0; T expected[] = {}; - fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); auto result = cudf::sequence(num_els, init, step); @@ -93,31 +87,31 @@ TYPED_TEST(SequenceTypedTestFixture, EmptyOutput) TEST_F(SequenceTestFixture, BadTypes) { - string_scalar string_init("zero"); - string_scalar string_step("???"); + cudf::string_scalar string_init("zero"); + cudf::string_scalar string_step("???"); EXPECT_THROW(cudf::sequence(10, string_init, string_step), cudf::logic_error); - numeric_scalar bool_init(true); - numeric_scalar bool_step(false); + cudf::numeric_scalar bool_init(true); + cudf::numeric_scalar bool_step(false); EXPECT_THROW(cudf::sequence(10, bool_init, bool_step), cudf::logic_error); - timestamp_scalar ts_init(duration_s{10}, true); - timestamp_scalar ts_step(duration_s{10}, true); + cudf::timestamp_scalar ts_init(cudf::duration_s{10}, true); + cudf::timestamp_scalar ts_step(cudf::duration_s{10}, true); EXPECT_THROW(cudf::sequence(10, ts_init, ts_step), cudf::logic_error); } TEST_F(SequenceTestFixture, MismatchedInputs) { - numeric_scalar init(0); - numeric_scalar step(-5); + cudf::numeric_scalar init(0); + cudf::numeric_scalar step(-5); EXPECT_THROW(cudf::sequence(10, init, step), cudf::logic_error); - numeric_scalar init2(0); - numeric_scalar step2(-5); + cudf::numeric_scalar init2(0); + 
cudf::numeric_scalar step2(-5); EXPECT_THROW(cudf::sequence(10, init2, step2), cudf::logic_error); - numeric_scalar init3(0); - numeric_scalar step3(-5); + cudf::numeric_scalar init3(0); + cudf::numeric_scalar step3(-5); EXPECT_THROW(cudf::sequence(10, init3, step3), cudf::logic_error); } @@ -125,12 +119,12 @@ TYPED_TEST(SequenceTypedTestFixture, DefaultStep) { using T = TypeParam; - numeric_scalar init(0); + cudf::numeric_scalar init(0); - size_type num_els = 10; + cudf::size_type num_els = 10; T expected[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - fixed_width_column_wrapper expected_w(expected, expected + num_els); + cudf::test::fixed_width_column_wrapper expected_w(expected, expected + num_els); auto result = cudf::sequence(num_els, init); @@ -140,11 +134,11 @@ TYPED_TEST(SequenceTypedTestFixture, DefaultStep) TEST_F(SequenceTestFixture, DateSequenceBasic) { // Timestamp generated using https://www.epochconverter.com/ - timestamp_scalar init(1629852896L, true); // 2021-08-25 00:54:56 GMT - size_type size{5}; - size_type months{1}; + cudf::timestamp_scalar init(1629852896L, true); // 2021-08-25 00:54:56 GMT + cudf::size_type size{5}; + cudf::size_type months{1}; - fixed_width_column_wrapper expected{ + cudf::test::fixed_width_column_wrapper expected{ 1629852896L, // 2021-08-25 00:54:56 GMT 1632531296L, // 2021-09-25 00:54:56 GMT 1635123296L, // 2021-10-25 00:54:56 GMT @@ -160,11 +154,11 @@ TEST_F(SequenceTestFixture, DateSequenceBasic) TEST_F(SequenceTestFixture, DateSequenceLeapYear) { // Timestamp generated using https://www.epochconverter.com/ - timestamp_scalar init(951876379L, true); // 2000-02-29 02:06:19 GMT - size_type size{5}; - size_type months{12}; + cudf::timestamp_scalar init(951876379L, true); // 2000-02-29 02:06:19 GMT + cudf::size_type size{5}; + cudf::size_type months{12}; - fixed_width_column_wrapper expected{ + cudf::test::fixed_width_column_wrapper expected{ 951876379L, // 2000-02-29 02:06:19 GMT Leap Year 983412379L, // 2001-02-28 02:06:19 GMT 
1014948379L, // 2002-02-28 02:06:19 GMT @@ -179,9 +173,9 @@ TEST_F(SequenceTestFixture, DateSequenceLeapYear) TEST_F(SequenceTestFixture, DateSequenceBadTypes) { - numeric_scalar init(951876379, true); - size_type size = 5; - size_type months = 12; + cudf::numeric_scalar init(951876379, true); + cudf::size_type size = 5; + cudf::size_type months = 12; EXPECT_THROW(calendrical_month_sequence(size, init, months), cudf::logic_error); } From 9753acec803a22672d495186a3265ed5fff38616 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Wed, 22 Mar 2023 19:41:05 -0700 Subject: [PATCH 44/63] Remove unused variable and fix memory issue in ORC writer (#12984) This removes an unused variable in the ORC writer, and also fixes a memory issue with dereferencing dangling pointers due to a device buffer being destroyed early while it should be kept alive. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/12984 --- cpp/src/io/orc/writer_impl.cu | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index d3bb0a45c12..bd526f4f4eb 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -594,8 +594,6 @@ void build_dictionaries(orc_table_view& orc_table, bool enable_dictionary, rmm::cuda_stream_view stream) { - const auto num_rowgroups = dict.size().first; - for (size_t dict_idx = 0; dict_idx < orc_table.num_string_columns(); ++dict_idx) { auto& str_column = orc_table.string_column(dict_idx); str_column.attach_stripe_dict(stripe_dict.base_host_ptr(), stripe_dict.base_device_ptr()); @@ -2206,6 +2204,7 @@ std::tuple, encoded_data, file_segmentation, + hostdevice_2dvector, std::vector, orc_table_view, rmm::device_buffer, @@ -2227,7 +2226,8 @@ 
convert_table_to_orc_data(table_view const& input, auto orc_table = make_orc_table_view(input, *input_tview, table_meta, stream); - auto const pd_masks = init_pushdown_null_masks(orc_table, stream); + // This is unused but it holds memory buffers for later access thus needs to be kept alive. + [[maybe_unused]] auto const pd_masks = init_pushdown_null_masks(orc_table, stream); auto rowgroup_bounds = calculate_rowgroup_bounds(orc_table, row_index_stride, stream); @@ -2297,6 +2297,7 @@ convert_table_to_orc_data(table_view const& input, std::move(strm_descs), std::move(enc_data), std::move(segmentation), + std::move(stripe_dict), std::move(stripes), std::move(orc_table), rmm::device_buffer{}, // compressed_data @@ -2382,6 +2383,7 @@ convert_table_to_orc_data(table_view const& input, std::move(strm_descs), std::move(enc_data), std::move(segmentation), + std::move(stripe_dict), std::move(stripes), std::move(orc_table), std::move(compressed_data), @@ -2454,16 +2456,17 @@ void writer::impl::write(table_view const& input) // is still intact. // Note that `out_sink_` is intentionally passed by const reference to prevent accidentally // writing anything to it. 
- auto [streams, - comp_results, - strm_descs, - enc_data, - segmentation, - stripes, - orc_table, - compressed_data, - intermediate_stats, - stream_output] = [&] { + [[maybe_unused]] auto [streams, + comp_results, + strm_descs, + enc_data, + segmentation, + stripe_dict, /* unused, but its data will be accessed via pointer later */ + stripes, + orc_table, + compressed_data, + intermediate_stats, + stream_output] = [&] { try { return convert_table_to_orc_data(input, *table_meta, From 90d2cb162785f044e357290779e762ea980f9583 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 23 Mar 2023 07:00:13 -0400 Subject: [PATCH 45/63] Move detail reduction functions to cudf::reduction::detail namespace (#12971) Moves internal/detail reduction (and segmented reduction) functions and other declarations from the `cudf::detail` namespace to the `cudf::reduction::detail` namespace. The detail function headers also moved from `cpp/include/cudf/detail` to `cpp/include/cudf/reduction/detail` directory. The public header `cpp/include/cudf/reduction.hpp` was not changed or moved. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Robert Maynard (https://github.com/robertmaynard) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/12971 --- conda/recipes/libcudf/meta.yaml | 4 +- .../cudf/{ => reduction}/detail/reduction.cuh | 4 +- .../detail/reduction_functions.hpp | 2 + .../detail/reduction_operators.cuh | 9 +- .../detail/segmented_reduction.cuh | 2 +- .../detail/segmented_reduction_functions.hpp | 148 +++++++++--------- .../stream_compaction/apply_boolean_mask.cu | 16 +- cpp/src/reductions/all.cu | 21 +-- cpp/src/reductions/any.cu | 21 +-- cpp/src/reductions/collect_ops.cu | 40 ++--- cpp/src/reductions/compound.cuh | 2 +- cpp/src/reductions/max.cu | 19 ++- cpp/src/reductions/mean.cu | 12 +- cpp/src/reductions/min.cu | 21 ++- cpp/src/reductions/nth_element.cu | 18 ++- cpp/src/reductions/product.cu | 12 +- cpp/src/reductions/reductions.cpp | 103 ++++++------ cpp/src/reductions/segmented/all.cu | 14 +- cpp/src/reductions/segmented/any.cu | 14 +- cpp/src/reductions/segmented/compound.cuh | 2 +- cpp/src/reductions/segmented/max.cu | 15 +- cpp/src/reductions/segmented/mean.cu | 6 +- cpp/src/reductions/segmented/min.cu | 15 +- cpp/src/reductions/segmented/product.cu | 17 +- cpp/src/reductions/segmented/reductions.cpp | 77 +++++---- cpp/src/reductions/segmented/simple.cuh | 16 +- cpp/src/reductions/segmented/std.cu | 7 +- cpp/src/reductions/segmented/sum.cu | 17 +- .../reductions/segmented/sum_of_squares.cu | 6 +- cpp/src/reductions/segmented/var.cu | 7 +- cpp/src/reductions/simple.cuh | 12 +- cpp/src/reductions/std.cu | 10 +- cpp/src/reductions/struct_minmax_util.cuh | 10 +- cpp/src/reductions/sum.cu | 11 +- cpp/src/reductions/sum_of_squares.cu | 11 +- cpp/src/reductions/var.cu | 9 +- 36 files changed, 354 insertions(+), 376 deletions(-) rename cpp/include/cudf/{ => reduction}/detail/reduction.cuh (99%) 
rename cpp/include/cudf/{ => reduction}/detail/reduction_functions.hpp (99%) rename cpp/include/cudf/{ => reduction}/detail/reduction_operators.cuh (97%) rename cpp/include/cudf/{ => reduction}/detail/segmented_reduction.cuh (99%) rename cpp/include/cudf/{ => reduction}/detail/segmented_reduction_functions.hpp (69%) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 0b2fc71aacd..469c25fb673 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -114,7 +114,6 @@ outputs: - test -f $PREFIX/include/cudf/detail/nvtx/nvtx3.hpp - test -f $PREFIX/include/cudf/detail/nvtx/ranges.hpp - test -f $PREFIX/include/cudf/detail/quantiles.hpp - - test -f $PREFIX/include/cudf/detail/reduction_functions.hpp - test -f $PREFIX/include/cudf/detail/repeat.hpp - test -f $PREFIX/include/cudf/detail/replace.hpp - test -f $PREFIX/include/cudf/detail/reshape.hpp @@ -123,7 +122,6 @@ outputs: - test -f $PREFIX/include/cudf/detail/scan.hpp - test -f $PREFIX/include/cudf/detail/scatter.hpp - test -f $PREFIX/include/cudf/detail/search.hpp - - test -f $PREFIX/include/cudf/detail/segmented_reduction_functions.hpp - test -f $PREFIX/include/cudf/detail/sequence.hpp - test -f $PREFIX/include/cudf/detail/sorting.hpp - test -f $PREFIX/include/cudf/detail/stream_compaction.hpp @@ -218,6 +216,8 @@ outputs: - test -f $PREFIX/include/cudf/partitioning.hpp - test -f $PREFIX/include/cudf/quantiles.hpp - test -f $PREFIX/include/cudf/reduction.hpp + - test -f $PREFIX/include/cudf/reduction/detail/reduction_functions.hpp + - test -f $PREFIX/include/cudf/reduction/detail/segmented_reduction_functions.hpp - test -f $PREFIX/include/cudf/replace.hpp - test -f $PREFIX/include/cudf/reshape.hpp - test -f $PREFIX/include/cudf/rolling.hpp diff --git a/cpp/include/cudf/detail/reduction.cuh b/cpp/include/cudf/reduction/detail/reduction.cuh similarity index 99% rename from cpp/include/cudf/detail/reduction.cuh rename to 
cpp/include/cudf/reduction/detail/reduction.cuh index 9dc3b996afc..1620635e0e3 100644 --- a/cpp/include/cudf/detail/reduction.cuh +++ b/cpp/include/cudf/reduction/detail/reduction.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include "reduction_operators.cuh" #include #include @@ -31,6 +31,8 @@ #include #include +#include + namespace cudf { namespace reduction { namespace detail { diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp similarity index 99% rename from cpp/include/cudf/detail/reduction_functions.hpp rename to cpp/include/cudf/reduction/detail/reduction_functions.hpp index c554ea6a83e..014a6ba70eb 100644 --- a/cpp/include/cudf/detail/reduction_functions.hpp +++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp @@ -27,6 +27,7 @@ namespace cudf { namespace reduction { +namespace detail { /** * @brief Computes sum of elements in input column * @@ -323,5 +324,6 @@ std::unique_ptr merge_sets(lists_column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/include/cudf/detail/reduction_operators.cuh b/cpp/include/cudf/reduction/detail/reduction_operators.cuh similarity index 97% rename from cpp/include/cudf/detail/reduction_operators.cuh rename to cpp/include/cudf/reduction/detail/reduction_operators.cuh index 5a0cb4c1714..0dba84a0b28 100644 --- a/cpp/include/cudf/detail/reduction_operators.cuh +++ b/cpp/include/cudf/reduction/detail/reduction_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,6 +26,7 @@ namespace cudf { namespace reduction { +namespace detail { // intermediate data structure to compute `var`, `std` template struct var_std { @@ -244,7 +245,7 @@ struct variance : public compound_op { using op = cudf::DeviceSum; template - using transformer = cudf::reduction::transformer_var_std; + using transformer = cudf::reduction::detail::transformer_var_std; template struct intermediate { @@ -270,7 +271,7 @@ struct standard_deviation : public compound_op { using op = cudf::DeviceSum; template - using transformer = cudf::reduction::transformer_var_std; + using transformer = cudf::reduction::detail::transformer_var_std; template struct intermediate { @@ -288,7 +289,7 @@ struct standard_deviation : public compound_op { }; }; }; - } // namespace op +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/include/cudf/detail/segmented_reduction.cuh b/cpp/include/cudf/reduction/detail/segmented_reduction.cuh similarity index 99% rename from cpp/include/cudf/detail/segmented_reduction.cuh rename to cpp/include/cudf/reduction/detail/segmented_reduction.cuh index 1c39d5eab1e..5c2eaf8cdcb 100644 --- a/cpp/include/cudf/detail/segmented_reduction.cuh +++ b/cpp/include/cudf/reduction/detail/segmented_reduction.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include "reduction_operators.cuh" #include #include diff --git a/cpp/include/cudf/detail/segmented_reduction_functions.hpp b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp similarity index 69% rename from cpp/include/cudf/detail/segmented_reduction_functions.hpp rename to cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp index 7b5628fa49a..c1bf59e5f65 100644 --- a/cpp/include/cudf/detail/segmented_reduction_functions.hpp +++ b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp @@ -27,6 +27,7 @@ namespace cudf { namespace reduction { +namespace detail { /** * @brief Compute sum of each segment in the input column @@ -50,14 
+51,13 @@ namespace reduction { * @param mr Device memory resource used to allocate the returned column's device memory * @return Sums of segments as type `output_dtype` */ -std::unique_ptr segmented_sum( - column_view const& col, - device_span offsets, - data_type const output_dtype, - null_policy null_handling, - std::optional> init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr segmented_sum(column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes product of each segment in the input column @@ -81,14 +81,13 @@ std::unique_ptr segmented_sum( * @param mr Device memory resource used to allocate the returned column's device memory * @return Product of segments as type `output_dtype` */ -std::unique_ptr segmented_product( - column_view const& col, - device_span offsets, - data_type const output_dtype, - null_policy null_handling, - std::optional> init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr segmented_product(column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Compute minimum of each segment in the input column @@ -111,14 +110,13 @@ std::unique_ptr segmented_product( * @param mr Device memory resource used to allocate the returned column's device memory * @return Minimums of segments as type `output_dtype` */ -std::unique_ptr segmented_min( - column_view const& col, - device_span offsets, - data_type const output_dtype, - null_policy null_handling, - std::optional> init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); +std::unique_ptr segmented_min(column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Compute maximum of each segment in the input column @@ -141,14 +139,13 @@ std::unique_ptr segmented_min( * @param mr Device memory resource used to allocate the returned column's device memory * @return Maximums of segments as type `output_dtype` */ -std::unique_ptr segmented_max( - column_view const& col, - device_span offsets, - data_type const output_dtype, - null_policy null_handling, - std::optional> init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr segmented_max(column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Compute if any of the values in the segment are true when typecasted to bool @@ -172,14 +169,13 @@ std::unique_ptr segmented_max( * @param mr Device memory resource used to allocate the returned column's device memory * @return Column of type BOOL8 for the results of the segments */ -std::unique_ptr segmented_any( - column_view const& col, - device_span offsets, - data_type const output_dtype, - null_policy null_handling, - std::optional> init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr segmented_any(column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Compute if all of the values in the segment are true when typecasted to bool @@ -203,14 +199,13 @@ std::unique_ptr 
segmented_any( * @param mr Device memory resource used to allocate the returned column's device memory * @return Column of BOOL8 for the results of the segments */ -std::unique_ptr segmented_all( - column_view const& col, - device_span offsets, - data_type const output_dtype, - null_policy null_handling, - std::optional> init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr segmented_all(column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes mean of elements of segments in the input column @@ -233,13 +228,12 @@ std::unique_ptr segmented_all( * @param mr Device memory resource used to allocate the returned column's device memory * @return Column of `output_dtype` for the reduction results of the segments */ -std::unique_ptr segmented_mean( - column_view const& col, - device_span offsets, - data_type const output_dtype, - null_policy null_handling, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr segmented_mean(column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes sum of squares of elements of segments in the input column @@ -262,13 +256,12 @@ std::unique_ptr segmented_mean( * @param mr Device memory resource used to allocate the returned column's device memory * @return Column of `output_dtype` for the reduction results of the segments */ -std::unique_ptr segmented_sum_of_squares( - column_view const& col, - device_span offsets, - data_type const output_dtype, - null_policy null_handling, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); +std::unique_ptr segmented_sum_of_squares(column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes the standard deviation of elements of segments in the input column @@ -293,14 +286,13 @@ std::unique_ptr segmented_sum_of_squares( * @param mr Device memory resource used to allocate the returned column's device memory * @return Column of `output_dtype` for the reduction results of the segments */ -std::unique_ptr segmented_standard_deviation( - column_view const& col, - device_span offsets, - data_type const output_dtype, - null_policy null_handling, - size_type ddof, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr segmented_standard_deviation(column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + size_type ddof, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Computes the variance of elements of segments in the input column @@ -325,14 +317,14 @@ std::unique_ptr segmented_standard_deviation( * @param mr Device memory resource used to allocate the returned column's device memory * @return Column of `output_dtype` for the reduction results of the segments */ -std::unique_ptr segmented_variance( - column_view const& col, - device_span offsets, - data_type const output_dtype, - null_policy null_handling, - size_type ddof, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr segmented_variance(column_view const& col, + device_span offsets, + data_type const output_dtype, + null_policy null_handling, + size_type ddof, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); +} // namespace detail } // namespace reduction } // 
namespace cudf diff --git a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu index 5acb1cb8849..0aaa8356304 100644 --- a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu +++ b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu @@ -21,10 +21,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -65,12 +65,14 @@ std::unique_ptr apply_boolean_mask(lists_column_view const& input, cudf::detail::slice( boolean_mask.offsets(), {boolean_mask.offset(), boolean_mask.size() + 1}, stream) .front(); - auto const sizes = cudf::reduction::segmented_sum(boolean_mask_sliced_child, - boolean_mask_sliced_offsets, - offset_data_type, - null_policy::EXCLUDE, - std::nullopt, - stream); + auto const sizes = + cudf::reduction::detail::segmented_sum(boolean_mask_sliced_child, + boolean_mask_sliced_offsets, + offset_data_type, + null_policy::EXCLUDE, + std::nullopt, + stream, + rmm::mr::get_current_device_resource()); auto const d_sizes = column_device_view::create(*sizes, stream); auto const sizes_begin = cudf::detail::make_null_replacement_iterator(*d_sizes, offset_type{0}); auto const sizes_end = sizes_begin + sizes->size(); diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu index 185e14b6e2f..9d32bc4c7f6 100644 --- a/cpp/src/reductions/all.cu +++ b/cpp/src/reductions/all.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,10 +14,11 @@ * limitations under the License. 
*/ -#include +#include "simple.cuh" + #include #include -#include +#include #include #include @@ -55,8 +56,7 @@ struct all_fn { { auto const d_dict = cudf::column_device_view::create(input, stream); auto const iter = [&] { - auto null_iter = - cudf::reduction::op::min{}.template get_null_replacing_element_transformer(); + auto null_iter = op::min{}.template get_null_replacing_element_transformer(); auto pair_iter = cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls()); return thrust::make_transform_iterator(pair_iter, null_iter); @@ -78,7 +78,6 @@ struct all_fn { }; } // namespace -} // namespace detail std::unique_ptr all(column_view const& col, cudf::data_type const output_dtype, @@ -93,15 +92,11 @@ std::unique_ptr all(column_view const& col, return cudf::type_dispatcher( dictionary_column_view(col).keys().type(), detail::all_fn{}, col, stream, mr); } + using reducer = simple::detail::bool_result_element_dispatcher; // dispatch for non-dictionary types - return cudf::type_dispatcher( - col.type(), - simple::detail::bool_result_element_dispatcher{}, - col, - init, - stream, - mr); + return cudf::type_dispatcher(col.type(), reducer{}, col, init, stream, mr); } +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu index 871672e5c03..07977d2417f 100644 --- a/cpp/src/reductions/any.cu +++ b/cpp/src/reductions/any.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,10 +14,11 @@ * limitations under the License. 
*/ -#include +#include "simple.cuh" + #include #include -#include +#include #include #include @@ -55,8 +56,7 @@ struct any_fn { { auto const d_dict = cudf::column_device_view::create(input, stream); auto const iter = [&] { - auto null_iter = - cudf::reduction::op::max{}.template get_null_replacing_element_transformer(); + auto null_iter = op::max{}.template get_null_replacing_element_transformer(); auto pair_iter = cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls()); return thrust::make_transform_iterator(pair_iter, null_iter); @@ -78,7 +78,6 @@ struct any_fn { }; } // namespace -} // namespace detail std::unique_ptr any(column_view const& col, cudf::data_type const output_dtype, @@ -93,15 +92,11 @@ std::unique_ptr any(column_view const& col, return cudf::type_dispatcher( dictionary_column_view(col).keys().type(), detail::any_fn{}, col, stream, mr); } + using reducer = simple::detail::bool_result_element_dispatcher; // dispatch for non-dictionary types - return cudf::type_dispatcher( - col.type(), - simple::detail::bool_result_element_dispatcher{}, - col, - init, - stream, - mr); + return cudf::type_dispatcher(col.type(), reducer{}, col, init, stream, mr); } +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/collect_ops.cu b/cpp/src/reductions/collect_ops.cu index 4d6a32b528a..743eddbffaf 100644 --- a/cpp/src/reductions/collect_ops.cu +++ b/cpp/src/reductions/collect_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,15 +17,15 @@ #include #include #include -#include #include #include +#include #include #include namespace cudf { namespace reduction { - +namespace detail { namespace { /** @@ -49,8 +49,8 @@ std::unique_ptr collect_list(column_view const& col, { if (need_handle_nulls(col, null_handling)) { auto d_view = column_device_view::create(col, stream); - auto filter = detail::validity_accessor(*d_view); - auto null_purged_table = detail::copy_if(table_view{{col}}, filter, stream, mr); + auto filter = cudf::detail::validity_accessor(*d_view); + auto null_purged_table = cudf::detail::copy_if(table_view{{col}}, filter, stream, mr); column* null_purged_col = null_purged_table->release().front().release(); null_purged_col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); return std::make_unique(std::move(*null_purged_col), true, stream, mr); @@ -86,13 +86,13 @@ std::unique_ptr collect_set(column_view const& col, return std::pair(col, std::unique_ptr(nullptr)); }(); - auto distinct_table = detail::distinct(table_view{{input_as_collect_list}}, - std::vector{0}, - duplicate_keep_option::KEEP_ANY, - nulls_equal, - nans_equal, - stream, - mr); + auto distinct_table = cudf::detail::distinct(table_view{{input_as_collect_list}}, + std::vector{0}, + duplicate_keep_option::KEEP_ANY, + nulls_equal, + nans_equal, + stream, + mr); return std::make_unique(std::move(distinct_table->get_column(0)), true, stream, mr); } @@ -104,15 +104,15 @@ std::unique_ptr merge_sets(lists_column_view const& col, rmm::mr::device_memory_resource* mr) { auto flatten_col = col.get_sliced_child(stream); - auto distinct_table = detail::distinct(table_view{{flatten_col}}, - std::vector{0}, - duplicate_keep_option::KEEP_ANY, - nulls_equal, - nans_equal, - stream, - mr); + auto distinct_table = cudf::detail::distinct(table_view{{flatten_col}}, + std::vector{0}, + duplicate_keep_option::KEEP_ANY, + nulls_equal, + nans_equal, + stream, + mr); return std::make_unique(std::move(distinct_table->get_column(0)), 
true, stream, mr); } - +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh index 9458ae2d581..3428130d912 100644 --- a/cpp/src/reductions/compound.cuh +++ b/cpp/src/reductions/compound.cuh @@ -16,8 +16,8 @@ #pragma once -#include #include +#include #include #include #include diff --git a/cpp/src/reductions/max.cu b/cpp/src/reductions/max.cu index b57896e5fc0..1cf2b6f53b6 100644 --- a/cpp/src/reductions/max.cu +++ b/cpp/src/reductions/max.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,16 @@ * limitations under the License. */ -#include +#include "simple.cuh" + #include -#include +#include #include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr max(column_view const& col, cudf::data_type const output_dtype, @@ -35,14 +37,11 @@ std::unique_ptr max(column_view const& col, auto const dispatch_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).indices().type() : col.type(); - return cudf::type_dispatcher( - dispatch_type, - simple::detail::same_element_type_dispatcher{}, - col, - init, - stream, - mr); + + using reducer = simple::detail::same_element_type_dispatcher; + return cudf::type_dispatcher(dispatch_type, reducer{}, col, init, stream, mr); } +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/mean.cu b/cpp/src/reductions/mean.cu index e4b5f754b9b..e64660932ce 100644 --- a/cpp/src/reductions/mean.cu +++ b/cpp/src/reductions/mean.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,26 +14,30 @@ * limitations under the License. */ -#include +#include "compound.cuh" + #include -#include +#include #include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr mean(column_view const& col, cudf::data_type const output_dtype, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - using reducer = compound::detail::element_type_dispatcher; auto col_type = cudf::is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type(); + + using reducer = compound::detail::element_type_dispatcher; return cudf::type_dispatcher( col_type, reducer(), col, output_dtype, /* ddof is not used for mean*/ 1, stream, mr); } +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/min.cu b/cpp/src/reductions/min.cu index ed16cec5ffd..792965e8b99 100644 --- a/cpp/src/reductions/min.cu +++ b/cpp/src/reductions/min.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,13 +14,14 @@ * limitations under the License. */ -#include +#include "simple.cuh" + #include -#include +#include namespace cudf { namespace reduction { - +namespace detail { std::unique_ptr min(column_view const& col, data_type const output_dtype, std::optional> init, @@ -33,14 +34,10 @@ std::unique_ptr min(column_view const& col, auto const dispatch_type = cudf::is_dictionary(col.type()) ? 
cudf::dictionary_column_view(col).indices().type() : col.type(); - return cudf::type_dispatcher( - dispatch_type, - simple::detail::same_element_type_dispatcher{}, - col, - init, - stream, - mr); -} + using reducer = simple::detail::same_element_type_dispatcher; + return cudf::type_dispatcher(dispatch_type, reducer{}, col, init, stream, mr); +} +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/nth_element.cu b/cpp/src/reductions/nth_element.cu index 78c469ee767..ef58ec3f42e 100644 --- a/cpp/src/reductions/nth_element.cu +++ b/cpp/src/reductions/nth_element.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include @@ -28,11 +28,13 @@ #include #include -std::unique_ptr cudf::reduction::nth_element(column_view const& col, - size_type n, - null_policy null_handling, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +namespace cudf::reduction::detail { + +std::unique_ptr nth_element(column_view const& col, + size_type n, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(n >= -col.size() and n < col.size(), "Index out of bounds"); auto wrap_n = [n](size_type size) { return (n < 0 ? size + n : n); }; @@ -60,3 +62,5 @@ std::unique_ptr cudf::reduction::nth_element(column_view const& co return cudf::detail::get_element(col, n, stream, mr); } } + +} // namespace cudf::reduction::detail diff --git a/cpp/src/reductions/product.cu b/cpp/src/reductions/product.cu index 39e031f69d1..2e483813939 100644 --- a/cpp/src/reductions/product.cu +++ b/cpp/src/reductions/product.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,16 @@ * limitations under the License. */ -#include +#include "simple.cuh" + #include -#include +#include #include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr product(column_view const& col, cudf::data_type const output_dtype, @@ -31,13 +33,13 @@ std::unique_ptr product(column_view const& col, { return cudf::type_dispatcher( cudf::is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type(), - simple::detail::element_type_dispatcher{}, + simple::detail::element_type_dispatcher{}, col, output_dtype, init, stream, mr); } - +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index cae2699aac7..b6c050287cf 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -19,11 +19,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include @@ -31,6 +31,7 @@ #include namespace cudf { +namespace reduction { namespace detail { struct reduce_dispatch_functor { column_view const col; @@ -52,89 +53,89 @@ struct reduce_dispatch_functor { std::unique_ptr operator()(reduce_aggregation const& agg) { switch (k) { - case aggregation::SUM: return reduction::sum(col, output_dtype, init, stream, mr); - case aggregation::PRODUCT: return reduction::product(col, output_dtype, init, stream, mr); - case aggregation::MIN: return reduction::min(col, output_dtype, init, stream, mr); - case aggregation::MAX: return reduction::max(col, output_dtype, init, stream, mr); - case aggregation::ANY: return reduction::any(col, output_dtype, init, stream, mr); - case aggregation::ALL: return reduction::all(col, output_dtype, init, stream, mr); - case aggregation::SUM_OF_SQUARES: 
- return reduction::sum_of_squares(col, output_dtype, stream, mr); - case aggregation::MEAN: return reduction::mean(col, output_dtype, stream, mr); + case aggregation::SUM: return sum(col, output_dtype, init, stream, mr); + case aggregation::PRODUCT: return product(col, output_dtype, init, stream, mr); + case aggregation::MIN: return min(col, output_dtype, init, stream, mr); + case aggregation::MAX: return max(col, output_dtype, init, stream, mr); + case aggregation::ANY: return any(col, output_dtype, init, stream, mr); + case aggregation::ALL: return all(col, output_dtype, init, stream, mr); + case aggregation::SUM_OF_SQUARES: return sum_of_squares(col, output_dtype, stream, mr); + case aggregation::MEAN: return mean(col, output_dtype, stream, mr); case aggregation::VARIANCE: { - auto var_agg = static_cast(agg); - return reduction::variance(col, output_dtype, var_agg._ddof, stream, mr); + auto var_agg = static_cast(agg); + return variance(col, output_dtype, var_agg._ddof, stream, mr); } case aggregation::STD: { - auto var_agg = static_cast(agg); - return reduction::standard_deviation(col, output_dtype, var_agg._ddof, stream, mr); + auto var_agg = static_cast(agg); + return standard_deviation(col, output_dtype, var_agg._ddof, stream, mr); } case aggregation::MEDIAN: { - auto current_mr = rmm::mr::get_current_device_resource(); - auto sorted_indices = - sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, current_mr); + auto current_mr = rmm::mr::get_current_device_resource(); + auto sorted_indices = cudf::detail::sorted_order( + table_view{{col}}, {}, {null_order::AFTER}, stream, current_mr); auto valid_sorted_indices = - split(*sorted_indices, {col.size() - col.null_count()}, stream)[0]; - auto col_ptr = quantile( + cudf::detail::split(*sorted_indices, {col.size() - col.null_count()}, stream)[0]; + auto col_ptr = cudf::detail::quantile( col, {0.5}, interpolation::LINEAR, valid_sorted_indices, true, stream, current_mr); - return get_element(*col_ptr, 0, 
stream, mr); + return cudf::detail::get_element(*col_ptr, 0, stream, mr); } case aggregation::QUANTILE: { - auto quantile_agg = static_cast(agg); + auto quantile_agg = static_cast(agg); CUDF_EXPECTS(quantile_agg._quantiles.size() == 1, "Reduction quantile accepts only one quantile value"); - auto current_mr = rmm::mr::get_current_device_resource(); - auto sorted_indices = - sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream, current_mr); + auto current_mr = rmm::mr::get_current_device_resource(); + auto sorted_indices = cudf::detail::sorted_order( + table_view{{col}}, {}, {null_order::AFTER}, stream, current_mr); auto valid_sorted_indices = - split(*sorted_indices, {col.size() - col.null_count()}, stream)[0]; + cudf::detail::split(*sorted_indices, {col.size() - col.null_count()}, stream)[0]; - auto col_ptr = quantile(col, - quantile_agg._quantiles, - quantile_agg._interpolation, - valid_sorted_indices, - true, - stream, - current_mr); - return get_element(*col_ptr, 0, stream, mr); + auto col_ptr = cudf::detail::quantile(col, + quantile_agg._quantiles, + quantile_agg._interpolation, + valid_sorted_indices, + true, + stream, + current_mr); + return cudf::detail::get_element(*col_ptr, 0, stream, mr); } case aggregation::NUNIQUE: { - auto nunique_agg = static_cast(agg); - return make_fixed_width_scalar( - detail::distinct_count(col, nunique_agg._null_handling, nan_policy::NAN_IS_VALID, stream), + auto nunique_agg = static_cast(agg); + return cudf::make_fixed_width_scalar( + cudf::detail::distinct_count( + col, nunique_agg._null_handling, nan_policy::NAN_IS_VALID, stream), stream, mr); } case aggregation::NTH_ELEMENT: { - auto nth_agg = static_cast(agg); - return reduction::nth_element(col, nth_agg._n, nth_agg._null_handling, stream, mr); + auto nth_agg = static_cast(agg); + return nth_element(col, nth_agg._n, nth_agg._null_handling, stream, mr); } case aggregation::COLLECT_LIST: { - auto col_agg = static_cast(agg); - return reduction::collect_list(col, 
col_agg._null_handling, stream, mr); + auto col_agg = static_cast(agg); + return collect_list(col, col_agg._null_handling, stream, mr); } case aggregation::COLLECT_SET: { - auto col_agg = static_cast(agg); - return reduction::collect_set( + auto col_agg = static_cast(agg); + return collect_set( col, col_agg._null_handling, col_agg._nulls_equal, col_agg._nans_equal, stream, mr); } case aggregation::MERGE_LISTS: { - return reduction::merge_lists(col, stream, mr); + return merge_lists(col, stream, mr); } case aggregation::MERGE_SETS: { - auto col_agg = static_cast(agg); - return reduction::merge_sets(col, col_agg._nulls_equal, col_agg._nans_equal, stream, mr); + auto col_agg = static_cast(agg); + return merge_sets(col, col_agg._nulls_equal, col_agg._nans_equal, stream, mr); } case aggregation::TDIGEST: { CUDF_EXPECTS(output_dtype.id() == type_id::STRUCT, "Tdigest aggregations expect output type to be STRUCT"); - auto td_agg = static_cast(agg); + auto td_agg = static_cast(agg); return tdigest::detail::reduce_tdigest(col, td_agg.max_centroids, stream, mr); } case aggregation::MERGE_TDIGEST: { CUDF_EXPECTS(output_dtype.id() == type_id::STRUCT, "Tdigest aggregations expect output type to be STRUCT"); - auto td_agg = static_cast(agg); + auto td_agg = static_cast(agg); return tdigest::detail::reduce_merge_tdigest(col, td_agg.max_centroids, stream, mr); } default: CUDF_FAIL("Unsupported reduction operator"); @@ -183,10 +184,11 @@ std::unique_ptr reduce(column_view const& col, return result; } - return aggregation_dispatcher( + return cudf::detail::aggregation_dispatcher( agg.kind, reduce_dispatch_functor{col, output_dtype, init, stream, mr}, agg); } } // namespace detail +} // namespace reduction std::unique_ptr reduce(column_view const& col, reduce_aggregation const& agg, @@ -194,7 +196,8 @@ std::unique_ptr reduce(column_view const& col, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::reduce(col, agg, output_dtype, std::nullopt, 
cudf::get_default_stream(), mr); + return reduction::detail::reduce( + col, agg, output_dtype, std::nullopt, cudf::get_default_stream(), mr); } std::unique_ptr reduce(column_view const& col, @@ -204,6 +207,6 @@ std::unique_ptr reduce(column_view const& col, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::reduce(col, agg, output_dtype, init, cudf::get_default_stream(), mr); + return reduction::detail::reduce(col, agg, output_dtype, init, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/reductions/segmented/all.cu b/cpp/src/reductions/segmented/all.cu index f75fcd8066c..b81a088155c 100644 --- a/cpp/src/reductions/segmented/all.cu +++ b/cpp/src/reductions/segmented/all.cu @@ -16,10 +16,11 @@ #include "simple.cuh" -#include +#include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr segmented_all( column_view const& col, @@ -33,17 +34,12 @@ std::unique_ptr segmented_all( CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8), "segmented_all() operation requires output type `BOOL8`"); + using reducer = simple::detail::bool_result_column_dispatcher; // A minimum over bool types is used to implement all() return cudf::type_dispatcher( - col.type(), - simple::detail::bool_result_column_dispatcher{}, - col, - offsets, - null_handling, - init, - stream, - mr); + col.type(), reducer{}, col, offsets, null_handling, init, stream, mr); } +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/segmented/any.cu b/cpp/src/reductions/segmented/any.cu index 6a4fc70d438..9210fbd3c7c 100644 --- a/cpp/src/reductions/segmented/any.cu +++ b/cpp/src/reductions/segmented/any.cu @@ -16,10 +16,11 @@ #include "simple.cuh" -#include +#include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr segmented_any( column_view const& col, @@ -33,17 +34,12 @@ std::unique_ptr segmented_any( CUDF_EXPECTS(output_dtype == 
cudf::data_type(cudf::type_id::BOOL8), "segmented_any() operation requires output type `BOOL8`"); + using reducer = simple::detail::bool_result_column_dispatcher; // A maximum over bool types is used to implement any() return cudf::type_dispatcher( - col.type(), - simple::detail::bool_result_column_dispatcher{}, - col, - offsets, - null_handling, - init, - stream, - mr); + col.type(), reducer{}, col, offsets, null_handling, init, stream, mr); } +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/segmented/compound.cuh b/cpp/src/reductions/segmented/compound.cuh index e8abd32cf61..395ad4c1dc9 100644 --- a/cpp/src/reductions/segmented/compound.cuh +++ b/cpp/src/reductions/segmented/compound.cuh @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/src/reductions/segmented/max.cu b/cpp/src/reductions/segmented/max.cu index d72b65301c1..c07c8fb2269 100644 --- a/cpp/src/reductions/segmented/max.cu +++ b/cpp/src/reductions/segmented/max.cu @@ -16,10 +16,11 @@ #include "simple.cuh" -#include +#include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr segmented_max( column_view const& col, @@ -32,16 +33,10 @@ std::unique_ptr segmented_max( { CUDF_EXPECTS(col.type() == output_dtype, "segmented_max() operation requires matching output type"); + using reducer = simple::detail::same_column_type_dispatcher; return cudf::type_dispatcher( - col.type(), - simple::detail::same_column_type_dispatcher{}, - col, - offsets, - null_handling, - init, - stream, - mr); + col.type(), reducer{}, col, offsets, null_handling, init, stream, mr); } - +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/segmented/mean.cu b/cpp/src/reductions/segmented/mean.cu index b7a5bfa43d6..99f1533a154 100644 --- a/cpp/src/reductions/segmented/mean.cu +++ b/cpp/src/reductions/segmented/mean.cu @@ -16,12 +16,13 @@ #include "compound.cuh" -#include 
+#include #include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr segmented_mean(column_view const& col, device_span offsets, @@ -30,11 +31,12 @@ std::unique_ptr segmented_mean(column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - using reducer = compound::detail::compound_segmented_dispatcher; + using reducer = compound::detail::compound_segmented_dispatcher; constexpr size_type ddof = 1; // ddof for mean calculation return cudf::type_dispatcher( col.type(), reducer{}, col, offsets, output_dtype, null_handling, ddof, stream, mr); } +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/segmented/min.cu b/cpp/src/reductions/segmented/min.cu index b7fbedf2690..f1597f90267 100644 --- a/cpp/src/reductions/segmented/min.cu +++ b/cpp/src/reductions/segmented/min.cu @@ -16,10 +16,11 @@ #include "simple.cuh" -#include +#include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr segmented_min( column_view const& col, @@ -32,16 +33,10 @@ std::unique_ptr segmented_min( { CUDF_EXPECTS(col.type() == output_dtype, "segmented_min() operation requires matching output type"); + using reducer = simple::detail::same_column_type_dispatcher; return cudf::type_dispatcher( - col.type(), - simple::detail::same_column_type_dispatcher{}, - col, - offsets, - null_handling, - init, - stream, - mr); + col.type(), reducer{}, col, offsets, null_handling, init, stream, mr); } - +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/segmented/product.cu b/cpp/src/reductions/segmented/product.cu index d5442126660..ea9c6f484c0 100644 --- a/cpp/src/reductions/segmented/product.cu +++ b/cpp/src/reductions/segmented/product.cu @@ -16,11 +16,11 @@ #include "simple.cuh" -#include +#include namespace cudf { namespace reduction { - +namespace detail { std::unique_ptr segmented_product( column_view const& col, device_span 
offsets, @@ -30,17 +30,10 @@ std::unique_ptr segmented_product( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + using reducer = simple::detail::column_type_dispatcher; return cudf::type_dispatcher( - col.type(), - simple::detail::column_type_dispatcher{}, - col, - offsets, - output_dtype, - null_handling, - init, - stream, - mr); + col.type(), reducer{}, col, offsets, output_dtype, null_handling, init, stream, mr); } - +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index 1de55b371b3..66b98fa8322 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -26,6 +26,7 @@ #include namespace cudf { +namespace reduction { namespace detail { struct segmented_reduce_dispatch_functor { column_view const& col; @@ -69,41 +70,32 @@ struct segmented_reduce_dispatch_functor { { switch (k) { case segmented_reduce_aggregation::SUM: - return reduction::segmented_sum( - col, offsets, output_dtype, null_handling, init, stream, mr); + return segmented_sum(col, offsets, output_dtype, null_handling, init, stream, mr); case segmented_reduce_aggregation::PRODUCT: - return reduction::segmented_product( - col, offsets, output_dtype, null_handling, init, stream, mr); + return segmented_product(col, offsets, output_dtype, null_handling, init, stream, mr); case segmented_reduce_aggregation::MIN: - return reduction::segmented_min( - col, offsets, output_dtype, null_handling, init, stream, mr); + return segmented_min(col, offsets, output_dtype, null_handling, init, stream, mr); case segmented_reduce_aggregation::MAX: - return reduction::segmented_max( - col, offsets, output_dtype, null_handling, init, stream, mr); + return segmented_max(col, offsets, output_dtype, null_handling, init, stream, mr); case 
segmented_reduce_aggregation::ANY: - return reduction::segmented_any( - col, offsets, output_dtype, null_handling, init, stream, mr); + return segmented_any(col, offsets, output_dtype, null_handling, init, stream, mr); case segmented_reduce_aggregation::ALL: - return reduction::segmented_all( - col, offsets, output_dtype, null_handling, init, stream, mr); + return segmented_all(col, offsets, output_dtype, null_handling, init, stream, mr); case segmented_reduce_aggregation::SUM_OF_SQUARES: - return reduction::segmented_sum_of_squares( - col, offsets, output_dtype, null_handling, stream, mr); + return segmented_sum_of_squares(col, offsets, output_dtype, null_handling, stream, mr); case segmented_reduce_aggregation::MEAN: - return reduction::segmented_mean(col, offsets, output_dtype, null_handling, stream, mr); - case aggregation::VARIANCE: { - auto var_agg = static_cast(agg); - return reduction::segmented_variance( + return segmented_mean(col, offsets, output_dtype, null_handling, stream, mr); + case segmented_reduce_aggregation::VARIANCE: { + auto var_agg = static_cast(agg); + return segmented_variance( col, offsets, output_dtype, null_handling, var_agg._ddof, stream, mr); } - case aggregation::STD: { - auto var_agg = static_cast(agg); - return reduction::segmented_standard_deviation( + case segmented_reduce_aggregation::STD: { + auto var_agg = static_cast(agg); + return segmented_standard_deviation( col, offsets, output_dtype, null_handling, var_agg._ddof, stream, mr); } - default: - CUDF_FAIL("Unsupported aggregation type."); - // TODO: Add support for compound_ops. 
GH #10432 + default: CUDF_FAIL("Unsupported aggregation type."); } } }; @@ -127,13 +119,14 @@ std::unique_ptr segmented_reduce(column_view const& segmented_values, } CUDF_EXPECTS(offsets.size() > 0, "`offsets` should have at least 1 element."); - return aggregation_dispatcher( + return cudf::detail::aggregation_dispatcher( agg.kind, segmented_reduce_dispatch_functor{ segmented_values, offsets, output_dtype, null_handling, init, stream, mr}, agg); } } // namespace detail +} // namespace reduction std::unique_ptr segmented_reduce(column_view const& segmented_values, device_span offsets, @@ -143,14 +136,14 @@ std::unique_ptr segmented_reduce(column_view const& segmented_values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::segmented_reduce(segmented_values, - offsets, - agg, - output_dtype, - null_handling, - std::nullopt, - cudf::get_default_stream(), - mr); + return reduction::detail::segmented_reduce(segmented_values, + offsets, + agg, + output_dtype, + null_handling, + std::nullopt, + cudf::get_default_stream(), + mr); } std::unique_ptr segmented_reduce(column_view const& segmented_values, @@ -162,14 +155,14 @@ std::unique_ptr segmented_reduce(column_view const& segmented_values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::segmented_reduce(segmented_values, - offsets, - agg, - output_dtype, - null_handling, - init, - cudf::get_default_stream(), - mr); + return reduction::detail::segmented_reduce(segmented_values, + offsets, + agg, + output_dtype, + null_handling, + init, + cudf::get_default_stream(), + mr); } } // namespace cudf diff --git a/cpp/src/reductions/segmented/simple.cuh b/cpp/src/reductions/segmented/simple.cuh index 0c22848fd89..32138f0835b 100644 --- a/cpp/src/reductions/segmented/simple.cuh +++ b/cpp/src/reductions/segmented/simple.cuh @@ -22,11 +22,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include @@ -133,8 +133,8 @@ 
std::unique_ptr simple_segmented_reduction( template || - std::is_same_v)> + CUDF_ENABLE_IF(std::is_same_v || + std::is_same_v)> std::unique_ptr string_segmented_reduction(column_view const& col, device_span offsets, null_policy null_handling, @@ -147,7 +147,7 @@ std::unique_ptr string_segmented_reduction(column_view const& col, auto it = thrust::make_counting_iterator(0); auto const num_segments = static_cast(offsets.size()) - 1; - bool constexpr is_argmin = std::is_same_v; + bool constexpr is_argmin = std::is_same_v; auto string_comparator = cudf::detail::element_argminmax_fn{*device_col, col.has_nulls(), is_argmin}; auto constexpr identity = @@ -178,8 +178,8 @@ std::unique_ptr string_segmented_reduction(column_view const& col, template () && - !std::is_same_v())> + CUDF_ENABLE_IF(!std::is_same_v() && + !std::is_same_v())> std::unique_ptr string_segmented_reduction(column_view const& col, device_span offsets, null_policy null_handling, @@ -215,7 +215,7 @@ std::unique_ptr fixed_point_segmented_reduction( auto result = simple_segmented_reduction(col, offsets, null_handling, init, stream, mr); auto const scale = [&] { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { // The product aggregation requires updating the scale of the fixed-point output column. // The output scale needs to be the maximum count of all segments multiplied by // the input scale value. 
@@ -245,7 +245,7 @@ std::unique_ptr fixed_point_segmented_reduction( return new_scale; } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { return numeric::scale_type{col.type().scale() * 2}; } diff --git a/cpp/src/reductions/segmented/std.cu b/cpp/src/reductions/segmented/std.cu index 6af5a9cf9b6..5f5ced63b8f 100644 --- a/cpp/src/reductions/segmented/std.cu +++ b/cpp/src/reductions/segmented/std.cu @@ -16,12 +16,13 @@ #include "compound.cuh" -#include +#include #include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr segmented_standard_deviation(column_view const& col, device_span offsets, @@ -31,11 +32,11 @@ std::unique_ptr segmented_standard_deviation(column_view const& co rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - using reducer = - compound::detail::compound_segmented_dispatcher; + using reducer = compound::detail::compound_segmented_dispatcher; return cudf::type_dispatcher( col.type(), reducer(), col, offsets, output_dtype, null_handling, ddof, stream, mr); } +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/segmented/sum.cu b/cpp/src/reductions/segmented/sum.cu index 0cb8decdc58..7e84961dee0 100644 --- a/cpp/src/reductions/segmented/sum.cu +++ b/cpp/src/reductions/segmented/sum.cu @@ -16,10 +16,11 @@ #include "simple.cuh" -#include +#include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr segmented_sum( column_view const& col, @@ -30,16 +31,10 @@ std::unique_ptr segmented_sum( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return cudf::type_dispatcher(col.type(), - simple::detail::column_type_dispatcher{}, - col, - offsets, - output_dtype, - null_handling, - init, - stream, - mr); + using reducer = simple::detail::column_type_dispatcher; + return cudf::type_dispatcher( + col.type(), reducer{}, col, offsets, output_dtype, null_handling, init, stream, mr); } - +} // namespace detail } // 
namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/segmented/sum_of_squares.cu b/cpp/src/reductions/segmented/sum_of_squares.cu index 1ee4f992b6d..6c3f286fd8d 100644 --- a/cpp/src/reductions/segmented/sum_of_squares.cu +++ b/cpp/src/reductions/segmented/sum_of_squares.cu @@ -16,12 +16,13 @@ #include "simple.cuh" -#include +#include #include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr segmented_sum_of_squares(column_view const& col, device_span offsets, @@ -30,10 +31,11 @@ std::unique_ptr segmented_sum_of_squares(column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - using reducer = simple::detail::column_type_dispatcher; + using reducer = simple::detail::column_type_dispatcher; return cudf::type_dispatcher( col.type(), reducer{}, col, offsets, output_dtype, null_handling, std::nullopt, stream, mr); } +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/segmented/var.cu b/cpp/src/reductions/segmented/var.cu index 84adf353700..4ac815b542f 100644 --- a/cpp/src/reductions/segmented/var.cu +++ b/cpp/src/reductions/segmented/var.cu @@ -16,12 +16,13 @@ #include "compound.cuh" -#include +#include #include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr segmented_variance(column_view const& col, device_span offsets, @@ -31,10 +32,10 @@ std::unique_ptr segmented_variance(column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - using reducer = compound::detail::compound_segmented_dispatcher; + using reducer = compound::detail::compound_segmented_dispatcher; return cudf::type_dispatcher( col.type(), reducer(), col, offsets, output_dtype, null_handling, ddof, stream, mr); } - +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh index c7c0d400106..189c17f9b28 100644 --- 
a/cpp/src/reductions/simple.cuh +++ b/cpp/src/reductions/simple.cuh @@ -16,13 +16,13 @@ #pragma once -#include +#include "struct_minmax_util.cuh" #include -#include #include #include #include +#include #include #include #include @@ -117,10 +117,10 @@ std::unique_ptr fixed_point_reduction( auto result = simple_reduction(col, init, stream, mr); auto const scale = [&] { - if (std::is_same_v) { + if (std::is_same_v) { auto const valid_count = static_cast(col.size() - col.null_count()); return numeric::scale_type{col.type().scale() * (valid_count + (init.has_value() ? 1 : 0))}; - } else if (std::is_same_v) { + } else if (std::is_same_v) { return numeric::scale_type{col.type().scale() * 2}; } return numeric::scale_type{col.type().scale()}; @@ -300,8 +300,8 @@ struct same_element_type_dispatcher { public: template && - (std::is_same_v || - std::is_same_v)>* = nullptr> + (std::is_same_v || + std::is_same_v)>* = nullptr> std::unique_ptr operator()(column_view const& input, std::optional> init, rmm::cuda_stream_view stream, diff --git a/cpp/src/reductions/std.cu b/cpp/src/reductions/std.cu index e9ba75f68e6..9df83634667 100644 --- a/cpp/src/reductions/std.cu +++ b/cpp/src/reductions/std.cu @@ -14,14 +14,16 @@ * limitations under the License. */ -#include +#include "compound.cuh" + #include -#include +#include #include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr standard_deviation(column_view const& col, cudf::data_type const output_dtype, @@ -31,8 +33,7 @@ std::unique_ptr standard_deviation(column_view const& col, { // TODO: add cuda version check when the fix is available #if !defined(__CUDACC_DEBUG__) - using reducer = - compound::detail::element_type_dispatcher; + using reducer = compound::detail::element_type_dispatcher; auto col_type = cudf::is_dictionary(col.type()) ? 
dictionary_column_view(col).keys().type() : col.type(); return cudf::type_dispatcher(col_type, reducer(), col, output_dtype, ddof, stream, mr); @@ -43,5 +44,6 @@ std::unique_ptr standard_deviation(column_view const& col, #endif } +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/struct_minmax_util.cuh b/cpp/src/reductions/struct_minmax_util.cuh index b2106066ff2..f8f7ee84e34 100644 --- a/cpp/src/reductions/struct_minmax_util.cuh +++ b/cpp/src/reductions/struct_minmax_util.cuh @@ -17,10 +17,10 @@ #pragma once #include -#include #include #include #include +#include #include #include #include @@ -134,10 +134,10 @@ class comparison_binop_generator { template static auto create(column_view const& input, rmm::cuda_stream_view stream) { - return comparison_binop_generator( - input, - stream, - std::is_same_v || std::is_same_v); + return comparison_binop_generator(input, + stream, + std::is_same_v || + std::is_same_v); } template diff --git a/cpp/src/reductions/sum.cu b/cpp/src/reductions/sum.cu index b919d871cc2..85c6b32dbaf 100644 --- a/cpp/src/reductions/sum.cu +++ b/cpp/src/reductions/sum.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,16 @@ * limitations under the License. */ -#include +#include "simple.cuh" + #include -#include +#include #include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr sum(column_view const& col, cudf::data_type const output_dtype, @@ -31,7 +33,7 @@ std::unique_ptr sum(column_view const& col, { return cudf::type_dispatcher( cudf::is_dictionary(col.type()) ? 
dictionary_column_view(col).keys().type() : col.type(), - simple::detail::element_type_dispatcher{}, + simple::detail::element_type_dispatcher{}, col, output_dtype, init, @@ -39,5 +41,6 @@ std::unique_ptr sum(column_view const& col, mr); } +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/sum_of_squares.cu b/cpp/src/reductions/sum_of_squares.cu index af28ba19c9a..7b85c4e6dc9 100644 --- a/cpp/src/reductions/sum_of_squares.cu +++ b/cpp/src/reductions/sum_of_squares.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,16 @@ * limitations under the License. */ -#include +#include "simple.cuh" + #include -#include +#include #include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr sum_of_squares(column_view const& col, cudf::data_type const output_dtype, @@ -30,7 +32,7 @@ std::unique_ptr sum_of_squares(column_view const& col, { return cudf::type_dispatcher( cudf::is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type(), - simple::detail::element_type_dispatcher{}, + simple::detail::element_type_dispatcher{}, col, output_dtype, std::nullopt, @@ -38,5 +40,6 @@ std::unique_ptr sum_of_squares(column_view const& col, mr); } +} // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/var.cu b/cpp/src/reductions/var.cu index 4d86918d6c6..d559531dc59 100644 --- a/cpp/src/reductions/var.cu +++ b/cpp/src/reductions/var.cu @@ -14,14 +14,16 @@ * limitations under the License. 
*/ -#include +#include "compound.cuh" + #include -#include +#include #include namespace cudf { namespace reduction { +namespace detail { std::unique_ptr variance(column_view const& col, cudf::data_type const output_dtype, @@ -31,7 +33,7 @@ std::unique_ptr variance(column_view const& col, { // TODO: add cuda version check when the fix is available #if !defined(__CUDACC_DEBUG__) - using reducer = compound::detail::element_type_dispatcher; + using reducer = compound::detail::element_type_dispatcher; auto col_type = cudf::is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type(); return cudf::type_dispatcher(col_type, reducer(), col, output_dtype, ddof, stream, mr); @@ -42,5 +44,6 @@ std::unique_ptr variance(column_view const& col, #endif } +} // namespace detail } // namespace reduction } // namespace cudf From 6966fd59fc10c03f6d58b68bca7eb73e2e1e627c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 23 Mar 2023 09:56:46 -0400 Subject: [PATCH 46/63] Remove default detail mrs: part4 (#12967) This is the fourth PR in a sequence removing default mr parameters in detail APIs. Contributes to https://github.com/rapidsai/cudf/issues/12944. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Mark Harris (https://github.com/harrism) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/12967 --- cpp/include/cudf/detail/null_mask.hpp | 54 ++++----- cpp/include/cudf/detail/structs/utilities.hpp | 10 +- cpp/include/cudf/lists/detail/concatenate.hpp | 9 +- cpp/include/cudf/lists/detail/extract.hpp | 20 ++-- .../cudf/lists/detail/interleave_columns.hpp | 11 +- cpp/include/cudf/lists/detail/reverse.hpp | 9 +- cpp/include/cudf/lists/detail/scatter.cuh | 43 ++++--- cpp/include/cudf/lists/detail/sorting.hpp | 24 ++-- .../cudf/lists/detail/stream_compaction.hpp | 22 ++-- cpp/src/groupby/hash/groupby.cu | 4 +- cpp/src/groupby/sort/group_scan_util.cuh | 6 +- cpp/src/groupby/sort/sort_helper.cu | 3 +- cpp/src/join/hash_join.cu | 48 ++++++-- cpp/src/join/mixed_join.cu | 6 +- cpp/src/join/mixed_join_semi.cu | 6 +- cpp/src/reductions/struct_minmax_util.cuh | 3 +- cpp/src/scalar/scalar.cpp | 3 +- cpp/src/search/contains_table.cu | 19 +-- cpp/src/stream_compaction/distinct_count.cu | 3 +- cpp/src/table/row_operators.cu | 9 +- cpp/src/unary/cast_ops.cu | 4 +- cpp/tests/join/join_tests.cpp | 4 +- cpp/tests/structs/utilities_tests.cpp | 110 ++++++++++++------ java/src/main/native/src/ColumnVectorJni.cpp | 6 +- java/src/main/native/src/ColumnViewJni.cu | 10 +- 25 files changed, 256 insertions(+), 190 deletions(-) diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index a0e04d7b215..7f1b15893c5 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,11 +31,10 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -rmm::device_buffer create_null_mask( - size_type size, - mask_state state, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +rmm::device_buffer create_null_mask(size_type size, + mask_state state, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::set_null_mask(bitmask_type*, size_type, size_type, bool) @@ -209,22 +208,20 @@ std::vector segmented_null_count(bitmask_type const* bitmask, * * @param stream CUDA stream used for device memory operations and kernel launches. */ -rmm::device_buffer copy_bitmask( - bitmask_type const* mask, - size_type begin_bit, - size_type end_bit, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +rmm::device_buffer copy_bitmask(bitmask_type const* mask, + size_type begin_bit, + size_type end_bit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::copy_bitmask(column_view const& view, rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -rmm::device_buffer copy_bitmask( - column_view const& view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +rmm::device_buffer copy_bitmask(column_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc bitmask_and(host_span, host_span const, @@ -232,32 +229,29 @@ rmm::device_buffer copy_bitmask( * * @param stream CUDA stream used for device memory operations and kernel launches */ -std::pair bitmask_and( - host_span masks, - host_span masks_begin_bits, - size_type mask_size_bits, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair bitmask_and(host_span masks, + host_span masks_begin_bits, + size_type mask_size_bits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::bitmask_and * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::pair bitmask_and( - table_view const& view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair bitmask_and(table_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::bitmask_or * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::pair bitmask_or( - table_view const& view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair bitmask_or(table_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Performs a bitwise AND of the specified bitmasks, diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp index 4a708d2fb51..5fcc331a382 100644 --- a/cpp/include/cudf/detail/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -175,7 +175,7 @@ class flattened_table { std::vector const& null_precedence, column_nullability nullability, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr); /** * @brief Superimpose nulls from a given null mask into the input column, using bitwise AND. @@ -222,9 +222,7 @@ class flattened_table { * to be kept alive. */ [[nodiscard]] std::pair push_down_nulls( - column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** * @brief Push down nulls from columns of the input table into their children columns, using @@ -251,9 +249,7 @@ class flattened_table { * to be kept alive. */ [[nodiscard]] std::pair push_down_nulls( - table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** * @brief Checks if a column or any of its children is a struct column with structs that are null. 
diff --git a/cpp/include/cudf/lists/detail/concatenate.hpp b/cpp/include/cudf/lists/detail/concatenate.hpp index 5a8b4bc3bf3..a1f149d4ccf 100644 --- a/cpp/include/cudf/lists/detail/concatenate.hpp +++ b/cpp/include/cudf/lists/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,10 +43,9 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ -std::unique_ptr concatenate( - host_span columns, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate(host_span columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/extract.hpp b/cpp/include/cudf/lists/detail/extract.hpp index 44c31c9ddb2..013f9b491dd 100644 --- a/cpp/include/cudf/lists/detail/extract.hpp +++ b/cpp/include/cudf/lists/detail/extract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,22 +27,20 @@ namespace detail { * rmm::mr::device_memory_resource*) * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr extract_list_element( - lists_column_view lists_column, - size_type const index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_list_element(lists_column_view lists_column, + size_type const index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::lists::extract_list_element(lists_column_view, column_view const&, * rmm::mr::device_memory_resource*) * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_list_element( - lists_column_view lists_column, - column_view const& indices, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr extract_list_element(lists_column_view lists_column, + column_view const& indices, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/interleave_columns.hpp b/cpp/include/cudf/lists/detail/interleave_columns.hpp index 7ae90779fdc..a5cf67c95b9 100644 --- a/cpp/include/cudf/lists/detail/interleave_columns.hpp +++ b/cpp/include/cudf/lists/detail/interleave_columns.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,11 +44,10 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return The interleaved columns as a single column. 
*/ -std::unique_ptr interleave_columns( - table_view const& input, - bool has_null_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr interleave_columns(table_view const& input, + bool has_null_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/reverse.hpp b/cpp/include/cudf/lists/detail/reverse.hpp index d467a9ac70e..6e3b952a3b0 100644 --- a/cpp/include/cudf/lists/detail/reverse.hpp +++ b/cpp/include/cudf/lists/detail/reverse.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,9 +23,8 @@ namespace cudf::lists::detail { * @copydoc cudf::lists::reverse * @param stream CUDA stream used for device memory operations and kernel launches */ -std::unique_ptr reverse( - lists_column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr reverse(lists_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace cudf::lists::detail diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index c2b4778aac8..856914b445e 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -89,15 +89,14 @@ rmm::device_uvector list_vector_from_column( * @return New lists column. 
*/ template -std::unique_ptr scatter_impl( - rmm::device_uvector const& source_vector, - rmm::device_uvector& target_vector, - MapIterator scatter_map_begin, - MapIterator scatter_map_end, - column_view const& source, - column_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr scatter_impl(rmm::device_uvector const& source_vector, + rmm::device_uvector& target_vector, + MapIterator scatter_map_begin, + MapIterator scatter_map_end, + column_view const& source, + column_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(column_types_equal(source, target), "Mismatched column types."); @@ -170,13 +169,12 @@ std::unique_ptr scatter_impl( * @return New lists column. */ template -std::unique_ptr scatter( - column_view const& source, - MapIterator scatter_map_begin, - MapIterator scatter_map_end, - column_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr scatter(column_view const& source, + MapIterator scatter_map_begin, + MapIterator scatter_map_end, + column_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const num_rows = target.size(); if (num_rows == 0) { return cudf::empty_like(target); } @@ -227,13 +225,12 @@ std::unique_ptr scatter( * @return New lists column. 
*/ template -std::unique_ptr scatter( - scalar const& slr, - MapIterator scatter_map_begin, - MapIterator scatter_map_end, - column_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr scatter(scalar const& slr, + MapIterator scatter_map_begin, + MapIterator scatter_map_end, + column_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const num_rows = target.size(); if (num_rows == 0) { return cudf::empty_like(target); } diff --git a/cpp/include/cudf/lists/detail/sorting.hpp b/cpp/include/cudf/lists/detail/sorting.hpp index 1068a4c4b69..c378ca8cf06 100644 --- a/cpp/include/cudf/lists/detail/sorting.hpp +++ b/cpp/include/cudf/lists/detail/sorting.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,24 +28,22 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr sort_lists( - lists_column_view const& input, - order column_order, - null_order null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr sort_lists(lists_column_view const& input, + order column_order, + null_order null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::lists::stable_sort_lists * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr stable_sort_lists( - lists_column_view const& input, - order column_order, - null_order null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr stable_sort_lists(lists_column_view const& input, + order column_order, + null_order null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp index ba3dbb6594b..7ab9cf9a343 100644 --- a/cpp/include/cudf/lists/detail/stream_compaction.hpp +++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,22 +28,20 @@ namespace cudf::lists::detail { * * @param stream CUDA stream used for device memory operations and kernel launches */ -std::unique_ptr apply_boolean_mask( - lists_column_view const& input, - lists_column_view const& boolean_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr apply_boolean_mask(lists_column_view const& input, + lists_column_view const& boolean_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::list::distinct * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr distinct( - lists_column_view const& input, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr distinct(lists_column_view const& input, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace cudf::lists::detail diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index f8203218760..6c55b1438ee 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -487,7 +487,9 @@ void compute_single_pass_aggs(table_view const& keys, keys_have_nulls and include_null_keys == null_policy::EXCLUDE; auto row_bitmask = - skip_key_rows_with_nulls ? cudf::detail::bitmask_and(keys, stream).first : rmm::device_buffer{}; + skip_key_rows_with_nulls + ? cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first + : rmm::device_buffer{}; thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index cb954e614f2..f12efd3cd24 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -124,7 +124,7 @@ struct group_scan_functor() make_null_replacement_iterator(*values_view, OpType::template identity()), thrust::identity{}); do_scan(input, result_view->begin(), OpType{}); - result->set_null_mask(cudf::detail::copy_bitmask(values, stream)); + result->set_null_mask(cudf::detail::copy_bitmask(values, stream, mr)); } else { auto input = thrust::make_transform_iterator(values_view->begin(), thrust::identity{}); @@ -175,7 +175,7 @@ struct group_scan_functorset_null_mask(cudf::detail::copy_bitmask(values, stream), values.null_count()); + results->set_null_mask(cudf::detail::copy_bitmask(values, stream, mr), values.null_count()); return results; } }; diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index ebafcd75e6d..6e992f2f53b 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -223,7 +223,8 @@ column_view sort_groupby_helper::keys_bitmask_column(rmm::cuda_stream_view strea { if (_keys_bitmask_column) return _keys_bitmask_column->view(); - auto [row_bitmask, null_count] = cudf::detail::bitmask_and(_keys, stream); + auto [row_bitmask, null_count] = + cudf::detail::bitmask_and(_keys, stream, rmm::mr::get_current_device_resource()); _keys_bitmask_column = make_numeric_column( data_type(type_id::INT8), _keys.num_rows(), std::move(row_bitmask), null_count, stream); diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 7fb35e179e9..d0bdad73614 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -284,7 +284,8 @@ hash_join::hash_join(cudf::table_view const& build, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) : _is_empty{build.num_rows() == 0}, - _composite_bitmask{cudf::detail::bitmask_and(build, stream).first}, + _composite_bitmask{ + cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()).first}, _nulls_equal{compare_nulls}, _hash_table{compute_hash_table_size(build.num_rows()), 
cuco::empty_key{std::numeric_limits::max()}, @@ -298,8 +299,13 @@ hash_join::hash_join(cudf::table_view const& build, "Build column size is too big for hash join"); // need to store off the owning structures for some of the views in _build - _flattened_build_table = structs::detail::flatten_nested_columns( - build, {}, {}, structs::detail::column_nullability::FORCE, stream); + _flattened_build_table = + structs::detail::flatten_nested_columns(build, + {}, + {}, + structs::detail::column_nullability::FORCE, + stream, + rmm::mr::get_current_device_resource()); _build = _flattened_build_table->flattened_columns(); if (_is_empty) { return; } @@ -356,8 +362,13 @@ std::size_t hash_join::inner_join_size(cudf::table_view const& probe, // Return directly if build table is empty if (_is_empty) { return 0; } - auto flattened_probe = structs::detail::flatten_nested_columns( - probe, {}, {}, structs::detail::column_nullability::FORCE, stream); + auto flattened_probe = + structs::detail::flatten_nested_columns(probe, + {}, + {}, + structs::detail::column_nullability::FORCE, + stream, + rmm::mr::get_current_device_resource()); auto const flattened_probe_table = flattened_probe->flattened_columns(); auto build_table_ptr = cudf::table_device_view::create(_build, stream); @@ -381,8 +392,13 @@ std::size_t hash_join::left_join_size(cudf::table_view const& probe, // Trivial left join case - exit early if (_is_empty) { return probe.num_rows(); } - auto flattened_probe = structs::detail::flatten_nested_columns( - probe, {}, {}, structs::detail::column_nullability::FORCE, stream); + auto flattened_probe = + structs::detail::flatten_nested_columns(probe, + {}, + {}, + structs::detail::column_nullability::FORCE, + stream, + rmm::mr::get_current_device_resource()); auto const flattened_probe_table = flattened_probe->flattened_columns(); auto build_table_ptr = cudf::table_device_view::create(_build, stream); @@ -407,8 +423,13 @@ std::size_t hash_join::full_join_size(cudf::table_view const& 
probe, // Trivial left join case - exit early if (_is_empty) { return probe.num_rows(); } - auto flattened_probe = structs::detail::flatten_nested_columns( - probe, {}, {}, structs::detail::column_nullability::FORCE, stream); + auto flattened_probe = + structs::detail::flatten_nested_columns(probe, + {}, + {}, + structs::detail::column_nullability::FORCE, + stream, + rmm::mr::get_current_device_resource()); auto const flattened_probe_table = flattened_probe->flattened_columns(); auto build_table_ptr = cudf::table_device_view::create(_build, stream); @@ -474,8 +495,13 @@ hash_join::compute_hash_join(cudf::table_view const& probe, CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, "Probe column size is too big for hash join"); - auto flattened_probe = structs::detail::flatten_nested_columns( - probe, {}, {}, structs::detail::column_nullability::FORCE, stream); + auto flattened_probe = + structs::detail::flatten_nested_columns(probe, + {}, + {}, + structs::detail::column_nullability::FORCE, + stream, + rmm::mr::get_current_device_resource()); auto const flattened_probe_table = flattened_probe->flattened_columns(); CUDF_EXPECTS(_build.num_columns() == flattened_probe_table.num_columns(), diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index d35aeab39ec..8d66cba8f8d 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -139,7 +139,8 @@ mixed_join( // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we // won't be able to support AST conditions for those types anyway. 
- auto const row_bitmask = cudf::detail::bitmask_and(build, stream).first; + auto const row_bitmask = + cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()).first; build_join_hash_table( build, hash_table, compare_nulls, static_cast(row_bitmask.data()), stream); auto hash_table_view = hash_table.get_device_view(); @@ -387,7 +388,8 @@ compute_mixed_join_output_size(table_view const& left_equality, // TODO: To add support for nested columns we will need to flatten in many // places. However, this probably isn't worth adding any time soon since we // won't be able to support AST conditions for those types anyway. - auto const row_bitmask = cudf::detail::bitmask_and(build, stream).first; + auto const row_bitmask = + cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()).first; build_join_hash_table( build, hash_table, compare_nulls, static_cast(row_bitmask.data()), stream); auto hash_table_view = hash_table.get_device_view(); diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index fced5b1b33f..1304c4ae3b0 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -195,7 +195,8 @@ std::unique_ptr> mixed_join_semi( hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); } else { thrust::counting_iterator stencil(0); - auto const [row_bitmask, _] = cudf::detail::bitmask_and(build, stream); + auto const [row_bitmask, _] = + cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()); row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows @@ -433,7 +434,8 @@ compute_mixed_join_output_size_semi(table_view const& left_equality, hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); } else { thrust::counting_iterator stencil(0); - auto const [row_bitmask, _] = cudf::detail::bitmask_and(build, stream); + auto const [row_bitmask, _] = + 
cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()); row_is_valid pred{static_cast(row_bitmask.data())}; // insert valid rows diff --git a/cpp/src/reductions/struct_minmax_util.cuh b/cpp/src/reductions/struct_minmax_util.cuh index f8f7ee84e34..7b56646b153 100644 --- a/cpp/src/reductions/struct_minmax_util.cuh +++ b/cpp/src/reductions/struct_minmax_util.cuh @@ -102,7 +102,8 @@ class comparison_binop_generator { {}, std::vector{DEFAULT_NULL_ORDER}, cudf::structs::detail::column_nullability::MATCH_INCOMING, - stream)}, + stream, + rmm::mr::get_current_device_resource())}, d_flattened_input_ptr{ table_device_view::create(flattened_input->flattened_columns(), stream)}, is_min_op(is_min_op), diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 046bfee9e41..403dc8c9189 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -583,7 +583,8 @@ table struct_scalar::init_data(table&& data, auto data_cols = data.release(); // push validity mask down - auto const validity = cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream); + auto const validity = cudf::detail::create_null_mask( + 1, mask_state::ALL_NULL, stream, rmm::mr::get_current_device_resource()); for (auto& col : data_cols) { col = cudf::structs::detail::superimpose_nulls( static_cast(validity.data()), 1, std::move(col), stream, mr); diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index f770b4598cf..1a2f242ef87 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -125,7 +125,10 @@ std::pair build_row_bitmask(table_view // If there are more than one nullable column, we compute `bitmask_and` of their null masks. // Otherwise, we have only one nullable column and can use its null mask directly. 
if (nullable_columns.size() > 1) { - auto row_bitmask = cudf::detail::bitmask_and(table_view{nullable_columns}, stream).first; + auto row_bitmask = + cudf::detail::bitmask_and( + table_view{nullable_columns}, stream, rmm::mr::get_current_device_resource()) + .first; auto const row_bitmask_ptr = static_cast(row_bitmask.data()); return std::pair(std::move(row_bitmask), row_bitmask_ptr); } @@ -322,13 +325,13 @@ rmm::device_uvector contains_without_lists_or_nans(table_view const& hayst auto const has_any_nulls = haystack_has_nulls || needles_has_nulls; // Flatten the input tables. - auto const flatten_nullability = has_any_nulls - ? structs::detail::column_nullability::FORCE - : structs::detail::column_nullability::MATCH_INCOMING; - auto const haystack_flattened_tables = - structs::detail::flatten_nested_columns(haystack, {}, {}, flatten_nullability, stream); - auto const needles_flattened_tables = - structs::detail::flatten_nested_columns(needles, {}, {}, flatten_nullability, stream); + auto const flatten_nullability = has_any_nulls + ? 
structs::detail::column_nullability::FORCE + : structs::detail::column_nullability::MATCH_INCOMING; + auto const haystack_flattened_tables = structs::detail::flatten_nested_columns( + haystack, {}, {}, flatten_nullability, stream, rmm::mr::get_current_device_resource()); + auto const needles_flattened_tables = structs::detail::flatten_nested_columns( + needles, {}, {}, flatten_nullability, stream, rmm::mr::get_current_device_resource()); auto const haystack_flattened = haystack_flattened_tables->flattened_columns(); auto const needles_flattened = needles_flattened_tables->flattened_columns(); auto const haystack_tdv_ptr = table_device_view::create(haystack_flattened, stream); diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 0dae26c18a9..8c50f8d29e8 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -151,7 +151,8 @@ cudf::size_type distinct_count(table_view const& keys, // when nulls are equal, insert non-null rows only to improve efficiency if (nulls_equal == null_equality::EQUAL and has_nulls) { thrust::counting_iterator stencil(0); - auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream); + auto const [row_bitmask, null_count] = + cudf::detail::bitmask_or(keys, stream, rmm::mr::get_current_device_resource()); row_validity pred{static_cast(row_bitmask.data())}; key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value()); diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 0c6747f2d12..6997de18be5 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -25,6 +25,8 @@ #include #include +#include + #include namespace cudf { @@ -401,9 +403,10 @@ std::shared_ptr preprocessed_table::create(table_view const& { check_eq_compatibility(t); - auto [null_pushed_table, nullable_data] = structs::detail::push_down_nulls(t, stream); - auto 
struct_offset_removed_table = remove_struct_child_offsets(null_pushed_table); - auto verticalized_t = std::get<0>(decompose_structs(struct_offset_removed_table)); + auto [null_pushed_table, nullable_data] = + structs::detail::push_down_nulls(t, stream, rmm::mr::get_current_device_resource()); + auto struct_offset_removed_table = remove_struct_child_offsets(null_pushed_table); + auto verticalized_t = std::get<0>(decompose_structs(struct_offset_removed_table)); auto d_t = table_device_view_owner(table_device_view::create(verticalized_t, stream)); return std::shared_ptr(new preprocessed_table( diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index b569ce04c31..6e19fc2ca3f 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -328,7 +328,7 @@ struct dispatch_unary_cast_to { auto output = std::make_unique(cudf::data_type{type.id(), input.type().scale()}, size, rmm::device_buffer{size * cudf::size_of(type), stream}, - copy_bitmask(input, stream), + copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 31500319592..404ff7d8380 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1865,8 +1865,8 @@ TEST_F(JoinTest, Repro_StructsWithoutNullsPushedDown) // Note: Join result might not have nulls pushed down, since it's an output of gather(). // Must superimpose parent nulls before comparisons. 
- auto [superimposed_results, _] = - cudf::structs::detail::push_down_nulls(*result, cudf::get_default_stream()); + auto [superimposed_results, _] = cudf::structs::detail::push_down_nulls( + *result, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expected = [] { auto fact_ints = ints{0}; diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp index e92b96553c0..327fede6126 100644 --- a/cpp/tests/structs/utilities_tests.cpp +++ b/cpp/tests/structs/utilities_tests.cpp @@ -53,9 +53,14 @@ TYPED_TEST(TypedStructUtilitiesTest, ListsAtTopLevel) auto lists_col = lists{{0, 1}, {22, 33}, {44, 55, 66}}; auto nums_col = nums{{0, 1, 2}, cudf::test::iterators::null_at(6)}; - auto table = cudf::table_view{{lists_col, nums_col}}; - auto flattened_table = cudf::structs::detail::flatten_nested_columns( - table, {}, {}, cudf::structs::detail::column_nullability::FORCE, cudf::get_default_stream()); + auto table = cudf::table_view{{lists_col, nums_col}}; + auto flattened_table = + cudf::structs::detail::flatten_nested_columns(table, + {}, + {}, + cudf::structs::detail::column_nullability::FORCE, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_TABLES_EQUAL(table, flattened_table->flattened_columns()); } @@ -76,7 +81,8 @@ TYPED_TEST(TypedStructUtilitiesTest, NestedListsUnsupported) {}, {}, cudf::structs::detail::column_nullability::FORCE, - cudf::get_default_stream()), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()), cudf::logic_error); } @@ -90,9 +96,14 @@ TYPED_TEST(TypedStructUtilitiesTest, NoStructs) {"", "1", "22", "333", "4444", "55555", "666666"}, cudf::test::iterators::null_at(1)}; auto nuther_nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, cudf::test::iterators::null_at(6)}; - auto table = cudf::table_view{{nums_col, strings_col, nuther_nums_col}}; - auto flattened_table = cudf::structs::detail::flatten_nested_columns( - table, {}, {}, 
cudf::structs::detail::column_nullability::FORCE, cudf::get_default_stream()); + auto table = cudf::table_view{{nums_col, strings_col, nuther_nums_col}}; + auto flattened_table = + cudf::structs::detail::flatten_nested_columns(table, + {}, + {}, + cudf::structs::detail::column_nullability::FORCE, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_TABLES_EQUAL(table, flattened_table->flattened_columns()); } @@ -118,8 +129,13 @@ TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStruct) auto expected = cudf::table_view{ {expected_nums_col_1, expected_structs_col, expected_nums_col_2, expected_strings_col}}; - auto flattened_table = cudf::structs::detail::flatten_nested_columns( - table, {}, {}, cudf::structs::detail::column_nullability::FORCE, cudf::get_default_stream()); + auto flattened_table = + cudf::structs::detail::flatten_nested_columns(table, + {}, + {}, + cudf::structs::detail::column_nullability::FORCE, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, flattened_table->flattened_columns()); } @@ -146,8 +162,13 @@ TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStructWithNulls) auto expected = cudf::table_view{ {expected_nums_col_1, expected_structs_col, expected_nums_col_2, expected_strings_col}}; - auto flattened_table = cudf::structs::detail::flatten_nested_columns( - table, {}, {}, cudf::structs::detail::column_nullability::FORCE, cudf::get_default_stream()); + auto flattened_table = + cudf::structs::detail::flatten_nested_columns(table, + {}, + {}, + cudf::structs::detail::column_nullability::FORCE, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, flattened_table->flattened_columns()); } @@ -185,8 +206,13 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStruct) expected_nums_col_3, expected_strings_col}}; - auto flattened_table = cudf::structs::detail::flatten_nested_columns( - table, {}, {}, 
cudf::structs::detail::column_nullability::FORCE, cudf::get_default_stream()); + auto flattened_table = + cudf::structs::detail::flatten_nested_columns(table, + {}, + {}, + cudf::structs::detail::column_nullability::FORCE, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, flattened_table->flattened_columns()); } @@ -225,8 +251,13 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtLeafLevel) expected_nums_col_3, expected_strings_col}}; - auto flattened_table = cudf::structs::detail::flatten_nested_columns( - table, {}, {}, cudf::structs::detail::column_nullability::FORCE, cudf::get_default_stream()); + auto flattened_table = + cudf::structs::detail::flatten_nested_columns(table, + {}, + {}, + cudf::structs::detail::column_nullability::FORCE, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, flattened_table->flattened_columns()); } @@ -266,8 +297,13 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtTopLevel) expected_nums_col_3, expected_strings_col}}; - auto flattened_table = cudf::structs::detail::flatten_nested_columns( - table, {}, {}, cudf::structs::detail::column_nullability::FORCE, cudf::get_default_stream()); + auto flattened_table = + cudf::structs::detail::flatten_nested_columns(table, + {}, + {}, + cudf::structs::detail::column_nullability::FORCE, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, flattened_table->flattened_columns()); } @@ -307,8 +343,13 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtAllLevels) expected_nums_col_3, expected_strings_col}}; - auto flattened_table = cudf::structs::detail::flatten_nested_columns( - table, {}, {}, cudf::structs::detail::column_nullability::FORCE, cudf::get_default_stream()); + auto flattened_table = + cudf::structs::detail::flatten_nested_columns(table, + {}, + {}, + 
cudf::structs::detail::column_nullability::FORCE, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, flattened_table->flattened_columns()); } @@ -330,7 +371,8 @@ TYPED_TEST(TypedStructUtilitiesTest, ListsAreUnsupported) {}, {}, cudf::structs::detail::column_nullability::FORCE, - cudf::get_default_stream()), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()), cudf::logic_error); } @@ -346,8 +388,8 @@ TYPED_TEST_SUITE(TypedSuperimposeTest, cudf::test::FixedWidthTypes); void test_non_struct_columns(cudf::column_view const& input) { // push_down_nulls() on non-struct columns should return the input column, unchanged. - auto [superimposed, backing_data] = - cudf::structs::detail::push_down_nulls(input, cudf::get_default_stream()); + auto [superimposed, backing_data] = cudf::structs::detail::push_down_nulls( + input, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(input, superimposed); EXPECT_TRUE(backing_data.new_null_masks.empty()); @@ -410,8 +452,8 @@ TYPED_TEST(TypedSuperimposeTest, BasicStruct) CUDF_TEST_EXPECT_COLUMNS_EQUAL(structs_view.child(1), make_lists_member(cudf::test::iterators::nulls_at({4, 5}))); - auto [output, backing_data] = - cudf::structs::detail::push_down_nulls(structs_view, cudf::get_default_stream()); + auto [output, backing_data] = cudf::structs::detail::push_down_nulls( + structs_view, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); // After push_down_nulls(), the struct nulls (i.e. at index-0) should have been pushed // down to the children. All members should have nulls at row-index 0. 
@@ -436,8 +478,8 @@ TYPED_TEST(TypedSuperimposeTest, NonNullableParentStruct) cudf::test::iterators::no_nulls()} .release(); - auto [output, backing_data] = - cudf::structs::detail::push_down_nulls(structs_input->view(), cudf::get_default_stream()); + auto [output, backing_data] = cudf::structs::detail::push_down_nulls( + structs_input->view(), cudf::get_default_stream(), rmm::mr::get_current_device_resource()); // After push_down_nulls(), none of the child structs should have changed, // because the parent had no nulls to begin with. @@ -471,8 +513,8 @@ TYPED_TEST(TypedSuperimposeTest, NestedStruct_ChildNullable_ParentNonNullable) auto structs_of_structs = cudf::test::structs_column_wrapper{std::move(outer_struct_members)}.release(); - auto [output, backing_data] = - cudf::structs::detail::push_down_nulls(structs_of_structs->view(), cudf::get_default_stream()); + auto [output, backing_data] = cudf::structs::detail::push_down_nulls( + structs_of_structs->view(), cudf::get_default_stream(), rmm::mr::get_current_device_resource()); // After push_down_nulls(), outer-struct column should not have pushed nulls to child // structs. But the child struct column must push its nulls to its own children. @@ -514,8 +556,8 @@ TYPED_TEST(TypedSuperimposeTest, NestedStruct_ChildNullable_ParentNullable) cudf::detail::set_null_mask( structs_of_structs_view.null_mask(), 1, 2, false, cudf::get_default_stream()); - auto [output, backing_data] = - cudf::structs::detail::push_down_nulls(structs_of_structs->view(), cudf::get_default_stream()); + auto [output, backing_data] = cudf::structs::detail::push_down_nulls( + structs_of_structs->view(), cudf::get_default_stream(), rmm::mr::get_current_device_resource()); // After push_down_nulls(), outer-struct column should not have pushed nulls to child // structs. But the child struct column must push its nulls to its own children. 
@@ -570,8 +612,8 @@ TYPED_TEST(TypedSuperimposeTest, Struct_Sliced) // nums_member: 11011 // lists_member: 00111 - auto [output, backing_data] = - cudf::structs::detail::push_down_nulls(sliced_structs, cudf::get_default_stream()); + auto [output, backing_data] = cudf::structs::detail::push_down_nulls( + sliced_structs, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); // After push_down_nulls(), the null masks should be: // STRUCT: 11110 @@ -623,8 +665,8 @@ TYPED_TEST(TypedSuperimposeTest, NestedStruct_Sliced) // nums_member: 11010 // lists_member: 00110 - auto [output, backing_data] = - cudf::structs::detail::push_down_nulls(sliced_structs, cudf::get_default_stream()); + auto [output, backing_data] = cudf::structs::detail::push_down_nulls( + sliced_structs, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); // After push_down_nulls(), the null masks will be: // STRUCT: 11101 diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index 6dc7de13560..1d22d8a5d79 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,6 +32,7 @@ #include #include #include +#include #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" @@ -296,7 +297,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_concatenate(JNIEnv *env cudf::jni::native_jpointerArray{env, column_handles}.get_dereferenced(); auto const is_lists_column = columns[0].type().id() == cudf::type_id::LIST; return release_as_jlong( - is_lists_column ? cudf::lists::detail::concatenate(columns, cudf::get_default_stream()) : + is_lists_column ? 
cudf::lists::detail::concatenate(columns, cudf::get_default_stream(), + rmm::mr::get_current_device_resource()) : cudf::concatenate(columns)); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu index 86c2add851a..9a96374688a 100644 --- a/java/src/main/native/src/ColumnViewJni.cu +++ b/java/src/main/native/src/ColumnViewJni.cu @@ -164,7 +164,8 @@ void post_process_list_overlap(cudf::column_view const &lhs, cudf::column_view c auto [null_mask, null_count] = cudf::detail::bitmask_and( std::vector{ overlap_cv.null_mask(), static_cast(new_null_mask.data())}, - std::vector{0, 0}, overlap_cv.size(), stream); + std::vector{0, 0}, overlap_cv.size(), stream, + rmm::mr::get_current_device_resource()); overlap_result->set_null_mask(std::move(null_mask), null_count); } else { // Just set the output nullmask as the new nullmask. @@ -214,9 +215,10 @@ std::unique_ptr lists_distinct_by_key(cudf::lists_column_view cons cudf::detail::labels_to_offsets(labels_begin, labels_begin + out_labels.size(), offsets_begin, offsets_begin + out_offsets->size(), stream); - return cudf::make_lists_column(input.size(), std::move(out_offsets), std::move(out_structs), - input.null_count(), - cudf::detail::copy_bitmask(input.parent(), stream), stream); + return cudf::make_lists_column( + input.size(), std::move(out_offsets), std::move(out_structs), input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, rmm::mr::get_current_device_resource()), + stream); } } // namespace cudf::jni From 3a2609ba68d7511bc53451ef155ff87a0948ff9a Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 23 Mar 2023 16:18:56 +0000 Subject: [PATCH 47/63] Fix `__setitem__` on string columns when the scalar value ends in a null byte (#12991) Since numpy strings are fixed width and use a null byte as an indicator of the end of the string, there is no way to distinguish between numpy.str_("abc\x00").item() and numpy.str_("abc").item(). 
This has consequences for scalar preprocessing we do when constructing a cudf.Scalar, since that usually goes through numpy.astype(...).item(). So, when preprocessing as scalar, if we notice it is a string with trailing null bytes, keep it as is. Closes #12990. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/12991 --- python/cudf/cudf/tests/test_setitem.py | 15 +++++++++++++++ python/cudf/cudf/utils/dtypes.py | 9 +++++++++ 2 files changed, 24 insertions(+) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 4d9ffc7cd81..dd82a9244b6 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -353,3 +353,18 @@ def test_scatter_by_slice_with_start_and_step(): target[1::2] = source ctarget[1::2] = csource assert_eq(target, ctarget) + + +@pytest.mark.parametrize("n", [1, 3]) +def test_setitem_str_trailing_null(n): + trailing_nulls = "\x00" * n + s = cudf.Series(["a", "b", "c" + trailing_nulls]) + assert s[2] == "c" + trailing_nulls + s[0] = "a" + trailing_nulls + assert s[0] == "a" + trailing_nulls + s[1] = trailing_nulls + assert s[1] == trailing_nulls + s[0] = "" + assert s[0] == "" + s[0] = "\x00" + assert s[0] == "\x00" diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index acf00b3a3d5..2484003bd38 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -260,6 +260,15 @@ def to_cudf_compatible_scalar(val, dtype=None): ) or cudf.api.types.is_string_dtype(dtype): dtype = "str" + if isinstance(val, str) and val.endswith("\x00"): + # Numpy string dtypes are fixed width and use NULL to + # indicate the end of the string, so they cannot + # distinguish between "abc\x00" and "abc". 
+ # https://github.com/numpy/numpy/issues/20118 + # In this case, don't try going through numpy and just use + # the string value directly (cudf.DeviceScalar will DTRT) + return val + if isinstance(val, datetime.datetime): val = np.datetime64(val) elif isinstance(val, datetime.timedelta): From 7456690220eb1677aaf0aada0afa61a0307331d9 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 23 Mar 2023 16:50:31 +0000 Subject: [PATCH 48/63] Implement `groupby.sample` (#12882) To do so, obtain the group offsets and values (and hence index). Sample within each group, and then pull out rows from the original object. The fastest way to do this in Python is via the builtin random library, since neither numpy nor cupy offer a broadcasted/ufunc random.sample, and looping over the groups is very slow using either of them. Looping over the groups and using python random.sample is also slow, but less so. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12882 --- python/cudf/benchmarks/API/bench_dataframe.py | 26 +++- python/cudf/cudf/_lib/cpp/sorting.pxd | 9 +- python/cudf/cudf/_lib/sort.pyx | 67 ++++++++- python/cudf/cudf/core/groupby/groupby.py | 136 +++++++++++++++++- python/cudf/cudf/tests/test_groupby.py | 87 +++++++++++ 5 files changed, 318 insertions(+), 7 deletions(-) diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py index 42bfa854396..28777b23583 100644 --- a/python/cudf/benchmarks/API/bench_dataframe.py +++ b/python/cudf/benchmarks/API/bench_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. 
"""Benchmarks of DataFrame methods.""" @@ -104,6 +104,30 @@ def bench_groupby_agg(benchmark, dataframe, agg, num_key_cols, as_index, sort): benchmark(dataframe.groupby(by=by, as_index=as_index, sort=sort).agg, agg) +@benchmark_with_object(cls="dataframe", dtype="int", nulls=False, cols=6) +@pytest.mark.parametrize( + "num_key_cols", + [2, 3, 4], +) +@pytest.mark.parametrize("use_frac", [True, False]) +@pytest.mark.parametrize("replace", [True, False]) +@pytest.mark.parametrize("target_sample_frac", [0.1, 0.5, 1]) +def bench_groupby_sample( + benchmark, dataframe, num_key_cols, use_frac, replace, target_sample_frac +): + grouper = dataframe.groupby(by=list(dataframe.columns[:num_key_cols])) + if use_frac: + kwargs = {"frac": target_sample_frac, "replace": replace} + else: + minsize = grouper.size().min() + target_size = numpy.round( + target_sample_frac * minsize, decimals=0 + ).astype(int) + kwargs = {"n": target_size, "replace": replace} + + benchmark(grouper.sample, **kwargs) + + @benchmark_with_object(cls="dataframe", dtype="int") @pytest.mark.parametrize("num_cols_to_sort", [1]) def bench_sort_values(benchmark, dataframe, num_cols_to_sort): diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd index c6c42c327ac..b210ddf81dd 100644 --- a/python/cudf/cudf/_lib/cpp/sorting.pxd +++ b/python/cudf/cudf/_lib/cpp/sorting.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -38,3 +38,10 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: const table_view& table, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence) except + + + cdef unique_ptr[table] segmented_sort_by_key( + const table_view& values, + const table_view& keys, + const column_view& segment_offsets, + vector[libcudf_types.order] column_order, + vector[libcudf_types.null_order] null_precedence) except + diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 3b96cc618dd..3c3f8cabda6 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock @@ -18,11 +18,13 @@ from cudf._lib.cpp.search cimport lower_bound, upper_bound from cudf._lib.cpp.sorting cimport ( is_sorted as cpp_is_sorted, rank, + segmented_sort_by_key as cpp_segmented_sort_by_key, sorted_order, ) +from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport null_order, null_policy, order -from cudf._lib.utils cimport table_view_from_columns +from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns @acquire_spill_lock() @@ -143,6 +145,67 @@ def order_by(list columns_from_table, object ascending, str na_position): return Column.from_unique_ptr(move(c_result)) +def segmented_sort_by_key( + list values, + list keys, + Column segment_offsets, + list column_order=None, + list null_precedence=None, +): + """ + Sort segments of a table by given keys + + Parameters + ---------- + values : list[Column] + Columns of the table which will be sorted + keys : list[Column] + Columns making up the sort key + offsets : Column + Segment offsets + column_order : list[bool], optional + Sequence of boolean values which 
correspond to each column in + keys providing the sort order (default all True). + With True <=> ascending; False <=> descending. + null_precedence : list[str], optional + Sequence of "first" or "last" values (default "first") + indicating the position of null values when sorting the keys. + + Returns + ------- + list[Column] + list of value columns sorted by keys + """ + cdef table_view values_view = table_view_from_columns(values) + cdef table_view keys_view = table_view_from_columns(keys) + cdef column_view offsets_view = segment_offsets.view() + cdef vector[order] c_column_order + cdef vector[null_order] c_null_precedence + cdef unique_ptr[table] result + ncol = len(values) + column_order = column_order or [True] * ncol + null_precedence = null_precedence or ["first"] * ncol + for asc, null in zip(column_order, null_precedence): + c_column_order.push_back(order.ASCENDING if asc else order.DESCENDING) + if asc ^ (null == "first"): + c_null_precedence.push_back(null_order.AFTER) + elif asc ^ (null == "last"): + c_null_precedence.push_back(null_order.BEFORE) + else: + raise ValueError(f"Invalid null precedence {null}") + with nogil: + result = move( + cpp_segmented_sort_by_key( + values_view, + keys_view, + offsets_view, + c_column_order, + c_null_precedence, + ) + ) + return columns_from_unique_ptr(move(result)) + + @acquire_spill_lock() def digitize(list source_columns, list bins, bool right=False): """ diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 0e671cb6412..1ab1da92bc7 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -6,7 +6,7 @@ import warnings from collections import abc from functools import cached_property -from typing import Any, Iterable, List, Tuple, Union +from typing import Any, Iterable, List, Optional, Tuple, Union import cupy as cp import numpy as np @@ -16,6 +16,8 @@ from cudf._lib import groupby as libgroupby from cudf._lib.null_mask 
import bitmask_or from cudf._lib.reshape import interleave_columns +from cudf._lib.sort import segmented_sort_by_key +from cudf._lib.types import size_type_dtype from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.types import is_list_like from cudf.core.abc import Serializable @@ -637,7 +639,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # aggregation scheme in libcudf. This is probably "fast # enough" for most reasonable input sizes. _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=np.int32) + group_offsets = np.asarray(offsets, dtype=size_type_dtype) size_per_group = np.diff(group_offsets) # "Out of bounds" n for the group size either means no entries # (negative) or all the entries (positive) @@ -651,7 +653,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): group_offsets = group_offsets[:-1] else: group_offsets = group_offsets[1:] - size_per_group - to_take = np.arange(size_per_group.sum(), dtype=np.int32) + to_take = np.arange(size_per_group.sum(), dtype=size_type_dtype) fixup = np.empty_like(size_per_group) fixup[0] = 0 np.cumsum(size_per_group[:-1], out=fixup[1:]) @@ -870,6 +872,134 @@ def ngroup(self, ascending=True): group_ids._index = index return self._broadcast(group_ids) + def sample( + self, + n: Optional[int] = None, + frac: Optional[float] = None, + replace: bool = False, + weights: Union[abc.Sequence, "cudf.Series", None] = None, + random_state: Union[np.random.RandomState, int, None] = None, + ): + """Return a random sample of items in each group. + + Parameters + ---------- + n + Number of items to return for each group, if sampling + without replacement must be at most the size of the + smallest group. Cannot be used with frac. Default is + ``n=1`` if frac is None. + frac + Fraction of items to return. Cannot be used with n. + replace + Should sampling occur with or without replacement? 
+ weights + Sampling probability for each element. Must be the same + length as the grouped frame. Not currently supported. + random_state + Seed for random number generation. + + Returns + ------- + New dataframe or series with samples of appropriate size drawn + from each group. + + """ + if weights is not None: + # To implement this case again needs different algorithms + # in both cases. + # + # Without replacement, use the weighted reservoir sampling + # approach of Efraimidas and Spirakis (2006) + # https://doi.org/10.1016/j.ipl.2005.11.003, essentially, + # do a segmented argsort sorting on weight-scaled + # logarithmic deviates. See + # https://timvieira.github.io/blog/post/ + # 2019/09/16/algorithms-for-sampling-without-replacement/ + # + # With replacement is trickier, one might be able to use + # the alias method, otherwise we're back to bucketed + # rejection sampling. + raise NotImplementedError("Sampling with weights is not supported") + if frac is not None and n is not None: + raise ValueError("Cannot supply both of frac and n") + elif n is None and frac is None: + n = 1 + elif frac is not None and not (0 <= frac <= 1): + raise ValueError( + "Sampling with fraction must provide fraction in " + f"[0, 1], got {frac=}" + ) + # TODO: handle random states properly. + if random_state is not None and not isinstance(random_state, int): + raise NotImplementedError( + "Only integer seeds are supported for random_state " + "in this case" + ) + # Get the groups + # TODO: convince Cython to convert the std::vector offsets + # into a numpy array directly, rather than a list. + # TODO: this uses the sort-based groupby, could one use hash-based? 
+ _, offsets, _, group_values = self._grouped() + group_offsets = np.asarray(offsets, dtype=size_type_dtype) + size_per_group = np.diff(group_offsets) + if n is not None: + samples_per_group = np.broadcast_to( + size_type_dtype.type(n), size_per_group.shape + ) + if not replace and (minsize := size_per_group.min()) < n: + raise ValueError( + f"Cannot sample {n=} without replacement, " + f"smallest group is {minsize}" + ) + else: + # Pandas uses round-to-nearest, ties to even to + # pick sample sizes for the fractional case (unlike IEEE + # which is round-to-nearest, ties to sgn(x) * inf). + samples_per_group = np.round( + size_per_group * frac, decimals=0 + ).astype(size_type_dtype) + if replace: + # We would prefer to use cupy here, but their rng.integers + # interface doesn't take array-based low and high + # arguments. + low = 0 + high = np.repeat(size_per_group, samples_per_group) + rng = np.random.default_rng(seed=random_state) + indices = rng.integers(low, high, dtype=size_type_dtype) + indices += np.repeat(group_offsets[:-1], samples_per_group) + else: + # Approach: do a segmented argsort of the index array and take + # the first samples_per_group entries from sorted array. + # We will shuffle the group indices and then pick them out + # from the grouped dataframe index. + nrows = len(group_values) + indices = cp.arange(nrows, dtype=size_type_dtype) + if len(size_per_group) < 500: + # Empirically shuffling with cupy is faster at this scale + rs = cp.random.get_random_state() + rs.seed(seed=random_state) + for off, size in zip(group_offsets, size_per_group): + rs.shuffle(indices[off : off + size]) + else: + rng = cp.random.default_rng(seed=random_state) + (indices,) = segmented_sort_by_key( + [as_column(indices)], + [as_column(rng.random(size=nrows))], + as_column(group_offsets), + [], + [], + ) + indices = cp.asarray(indices.data_array_view(mode="read")) + # Which indices are we going to want? 
+ want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) + scan = np.empty_like(samples_per_group) + scan[0] = 0 + np.cumsum(samples_per_group[:-1], out=scan[1:]) + want += np.repeat(group_offsets[:-1] - scan, samples_per_group) + indices = indices[want] + return group_values.iloc[indices] + def serialize(self): header = {} frames = [] diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 35a01b81042..1b86c68b582 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1,8 +1,10 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. +import collections import datetime import itertools import operator +import string import textwrap from decimal import Decimal @@ -2962,6 +2964,91 @@ def test_groupby_dtypes(groups): assert_eq(pdf.groupby(groups).dtypes, df.groupby(groups).dtypes) +class TestSample: + @pytest.fixture(params=["default", "rangeindex", "intindex", "strindex"]) + def index(self, request): + n = 12 + if request.param == "rangeindex": + return cudf.RangeIndex(2, n + 2) + elif request.param == "intindex": + return cudf.Index( + [2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32" + ) + elif request.param == "strindex": + return cudf.StringIndex(list(string.ascii_lowercase[:n])) + elif request.param == "default": + return None + + @pytest.fixture( + params=[ + ["a", "a", "b", "b", "c", "c", "c", "d", "d", "d", "d", "d"], + [1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4], + ], + ids=["str-group", "int-group"], + ) + def df(self, index, request): + return cudf.DataFrame( + {"a": request.param, "b": request.param, "v": request.param}, + index=index, + ) + + @pytest.fixture(params=["a", ["a", "b"]], ids=["single-col", "two-col"]) + def by(self, request): + return request.param + + def expected(self, df, *, n=None, frac=None): + value_counts = collections.Counter(df.a.values_host) + if n is not None: + values = list( + itertools.chain.from_iterable( + itertools.repeat(v, n) for v in 
value_counts.keys() + ) + ) + elif frac is not None: + values = list( + itertools.chain.from_iterable( + itertools.repeat(v, round(count * frac)) + for v, count in value_counts.items() + ) + ) + else: + raise ValueError("Must provide either n or frac") + values = cudf.Series(sorted(values), dtype=df.a.dtype) + return cudf.DataFrame({"a": values, "b": values, "v": values}) + + @pytest.mark.parametrize("n", [None, 0, 1, 2]) + def test_constant_n_no_replace(self, df, by, n): + result = df.groupby(by).sample(n=n).sort_values("a") + n = 1 if n is None else n + assert_eq(self.expected(df, n=n), result.reset_index(drop=True)) + + def test_constant_n_no_replace_too_large_raises(self, df): + with pytest.raises(ValueError): + df.groupby("a").sample(n=3) + + @pytest.mark.parametrize("n", [1, 2, 3]) + def test_constant_n_replace(self, df, by, n): + result = df.groupby(by).sample(n=n, replace=True).sort_values("a") + assert_eq(self.expected(df, n=n), result.reset_index(drop=True)) + + def test_invalid_arguments(self, df): + with pytest.raises(ValueError): + df.groupby("a").sample(n=1, frac=0.1) + + def test_not_implemented_arguments(self, df): + with pytest.raises(NotImplementedError): + # These are valid weights, but we don't implement this yet. 
+ df.groupby("a").sample(n=1, weights=[1 / len(df)] * len(df)) + + @pytest.mark.parametrize("frac", [0, 1 / 3, 1 / 2, 2 / 3, 1]) + @pytest.mark.parametrize("replace", [False, True]) + def test_fraction_rounding(self, df, by, frac, replace): + result = ( + df.groupby(by).sample(frac=frac, replace=replace).sort_values("a") + ) + assert_eq(self.expected(df, frac=frac), result.reset_index(drop=True)) + + class TestHeadTail: @pytest.fixture(params=[-3, -2, -1, 0, 1, 2, 3], ids=lambda n: f"{n=}") def n(self, request): From 2818d4597612ad68e7638d37811fb37169c44c6e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 23 Mar 2023 12:05:28 -0500 Subject: [PATCH 49/63] Fix `find_common_dtype` and `values` to handle complex dtypes (#12537) Fixes: #12083, fixes #12115 This PR fixes `find_common_dtype` and `values` APIs to handle complex dtypes by raising an error instead of casting them to strings. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12537 --- python/cudf/cudf/core/frame.py | 17 ++++++++++++++--- python/cudf/cudf/tests/test_concat.py | 13 +++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 14 ++++++++++++++ python/cudf/cudf/utils/dtypes.py | 21 +++++++++++++++++++++ 4 files changed, 62 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ea6a6de0b2b..d8b9ee4d006 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -485,9 +485,20 @@ def get_column_values_na(col): ) if dtype is None: - dtype = find_common_type( - [col.dtype for col in self._data.values()] - ) + dtypes = [col.dtype for col in self._data.values()] + for dtype in dtypes: + if isinstance( + dtype, + ( + cudf.ListDtype, + cudf.core.dtypes.DecimalDtype, + cudf.StructDtype, + ), + ): + raise NotImplementedError( + f"{dtype} cannot be exposed as a cupy array" + ) + dtype = 
find_common_type(dtypes) matrix = make_empty_matrix( shape=(len(self), ncol), dtype=dtype, order="F" diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 2ff0bddf1c8..910f0b9cf86 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1869,3 +1869,16 @@ def test_concat_invalid_axis(axis): s = gd.Series([1, 2, 3]) with pytest.raises(ValueError): gd.concat([s], axis=axis) + + +@pytest.mark.parametrize( + "s1,s2", + [ + ([1, 2], [[1, 2], [3, 4]]), + ], +) +def test_concat_mixed_list_types_error(s1, s2): + s1, s2 = gd.Series(s1), gd.Series(s2) + + with pytest.raises(NotImplementedError): + gd.concat([s1, s2], ignore_index=True) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6a79555d43e..d7912985356 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10031,6 +10031,20 @@ def test_dataframe_transpose_complex_types(data): assert_eq(expected, actual) +@pytest.mark.parametrize( + "data", + [ + {"col": [{"a": 1.1}, {"a": 2.1}, {"a": 10.0}, {"a": 11.2323}, None]}, + {"a": [[{"b": 567}], None] * 10}, + {"a": [decimal.Decimal(10), decimal.Decimal(20), None]}, + ], +) +def test_dataframe_values_complex_types(data): + gdf = cudf.DataFrame(data) + with pytest.raises(NotImplementedError): + gdf.values + + def test_dataframe_from_arrow_slice(): table = pa.Table.from_pandas( pd.DataFrame.from_dict( diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2484003bd38..c7a8c8b4096 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -580,6 +580,27 @@ def find_common_type(dtypes): ) else: return cudf.dtype("O") + if any(cudf.api.types.is_list_dtype(dtype) for dtype in dtypes): + if len(dtypes) == 1: + return dtypes.get(0) + else: + # TODO: As list dtypes allow casting + # to identical types, improve this logic of returning a + # 
common dtype, for example: + # ListDtype(int64) & ListDtype(int32) common + # dtype could be ListDtype(int64). + raise NotImplementedError( + "Finding a common type for `ListDtype` is currently " + "not supported" + ) + if any(cudf.api.types.is_struct_dtype(dtype) for dtype in dtypes): + if len(dtypes) == 1: + return dtypes.get(0) + else: + raise NotImplementedError( + "Finding a common type for `StructDtype` is currently " + "not supported" + ) # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately From 5cdb9d98ddcd35be4dfe9920986464fd521ef6c2 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Thu, 23 Mar 2023 17:21:44 -0500 Subject: [PATCH 50/63] Pre-emptive fix for upstream `dask.dataframe.read_parquet` changes (#12983) Once https://github.com/dask/dask/pull/10007 is merged, users will be able to pass a dictionary of hive-partitioning options to `dd.read_parquet` (using the `dataset=` kwarg). This new feature provides a workaround for the fact that `pyarrow.dataset.Partitioning` objects **cannot** be serialized in Python. In order for this feature to be supported in `dask_cudf` the `CudfEngine.read_partition` method must account for the case that `partitioning` is a `dict`. **NOTE**: It is not possible to add test coverage for this change until dask#10007 is merged. However, I don't see any good reason not to merge this PR **before** dask#10007. 
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12983 --- python/dask_cudf/dask_cudf/io/parquet.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index b03ac256b05..f19c373150d 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -121,6 +121,8 @@ def _read_paths( if row_groups else None, strings_to_categorical=strings_to_categorical, + dataset_kwargs=dataset_kwargs, + categorical_partitions=False, **kwargs, ) for i, pof in enumerate(paths_or_fobs) @@ -191,6 +193,8 @@ def read_partition( dataset_kwargs = kwargs.get("dataset", {}) partitioning = partitioning or dataset_kwargs.get("partitioning", None) + if isinstance(partitioning, dict): + partitioning = pa_ds.partitioning(**partitioning) # Check if we are actually selecting any columns read_columns = columns From 4ab227dac5360bf21affea9bf3251fb12ece3213 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 23 Mar 2023 18:23:24 -0400 Subject: [PATCH 51/63] Pin numba version (#13001) Changes in the upcoming numba 0.57 release will break cudf until we've had time to adapt them, which will not happen in 23.04. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) - Graham Markall (https://github.com/gmarkall) - Lawrence Mitchell (https://github.com/wence-) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/13001 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 4 ++-- dependencies.yaml | 4 ++-- python/cudf/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b71101e7e3b..890cb199419 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -44,7 +44,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba>=0.56.2 +- numba>=0.56.4,<0.57 - numpy>=1.21 - numpydoc - nvcc_linux-64=11.8 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 6b23c8953d3..bbd9961320a 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -52,7 +52,7 @@ requirements: - cython >=0.29,<0.30 - scikit-build >=0.13.1 - setuptools - - numba >=0.56.2 + - numba >=0.56.4,<0.57 - dlpack >=0.5,<0.6.0a0 - pyarrow =10 - libcudf ={{ version }} @@ -64,7 +64,7 @@ requirements: - typing_extensions - pandas >=1.3,<1.6.0dev0 - cupy >=9.5.0,<12.0.0a0 - - numba >=0.56.2 + - numba >=0.56.4,<0.57 - numpy >=1.21 - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} - libcudf {{ version }} diff --git a/dependencies.yaml b/dependencies.yaml index 5f72b8b6dea..1bd664fc57d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -340,7 +340,7 @@ dependencies: packages: - cachetools - cuda-python>=11.7.1,<12.0 - - numba>=0.56.2 + - &numba numba>=0.56.4,<0.57 - nvtx>=0.2.1 - packaging - rmm==23.4.* @@ 
-493,4 +493,4 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - dask-cuda==23.4.* - - numba>=0.56.2 + - *numba diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 6832a7aef26..3b49c821eff 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "cuda-python>=11.7.1,<12.0", "cupy-cuda11x>=9.5.0,<12.0.0a0", "fsspec>=0.6.0", - "numba>=0.56.2", + "numba>=0.56.4,<0.57", "numpy>=1.21", "nvtx>=0.2.1", "packaging", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 49e1cb38da4..c91a9bb3b85 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -41,7 +41,7 @@ dynamic = ["entry-points"] [project.optional-dependencies] test = [ "dask-cuda==23.4.*", - "numba>=0.56.2", + "numba>=0.56.4,<0.57", "pytest", "pytest-cov", "pytest-xdist", From dd5252b892adb43dc362cceaed4fffd9c6329269 Mon Sep 17 00:00:00 2001 From: Navin Kumar <97137715+NVnavkumar@users.noreply.github.com> Date: Thu, 23 Mar 2023 15:25:02 -0700 Subject: [PATCH 52/63] Add JNI method for strings::replace multi variety (#12979) Adds the JNI API for `stringReplace` using column vector arguments for `targets` and `repls` (to make this consistent with the C++ API). Also adds unit tests for the new API. Part of the work for https://github.com/NVIDIA/spark-rapids/issues/7907. 
Authors: - Navin Kumar (https://github.com/NVnavkumar) Approvers: - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/12979 --- .../main/java/ai/rapids/cudf/ColumnView.java | 43 +++++++++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 20 +++++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 21 +++++++++ 3 files changed, 84 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 84183819854..7d93438d72e 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2914,6 +2914,41 @@ public final ColumnVector stringReplace(Scalar target, Scalar replace) { replace.getScalarHandle())); } + /** + * Returns a new strings column where target strings with each string are replaced with + * corresponding replacement strings. For each string in the column, the list of targets + * is searched within that string. If a target string is found, it is replaced by the + * corresponding entry in the repls column. All occurrences found in each string are replaced. + * The repls argument can optionally contain a single string. In this case, all matching + * target substrings will be replaced by that single string. + * + * Example: + * cv = ["hello", "goodbye"] + * targets = ["e","o"] + * repls = ["EE","OO"] + * r1 = cv.stringReplace(targets, repls) + * r1 is now ["hEEllO", "gOOOOdbyEE"] + * + * targets = ["e", "o"] + * repls = ["_"] + * r2 = cv.stringReplace(targets, repls) + * r2 is now ["h_ll_", "g__dby_"] + * + * @param targets Strings to search for in each string. + * @param repls Corresponding replacement strings for target strings. + * @return A new java column vector containing the replaced strings. 
+ */ + public final ColumnVector stringReplace(ColumnView targets, ColumnView repls) { + assert type.equals(DType.STRING) : "column type must be a String"; + assert targets != null : "target list may not be null"; + assert targets.getType().equals(DType.STRING) : "target list must be a string column"; + assert repls != null : "replacement list may not be null"; + assert repls.getType().equals(DType.STRING) : "replacement list must be a string column"; + + return new ColumnVector(stringReplaceMulti(getNativeView(), targets.getNativeView(), + repls.getNativeView())); + } + /** * For each string, replaces any character sequence matching the given pattern using the * replacement string scalar. @@ -4170,6 +4205,14 @@ private static native long substringColumn(long columnView, long startColumn, lo */ private static native long stringReplace(long columnView, long target, long repl) throws CudfException; + /** + * Native method to replace target strings by corresponding repl strings. + * @param inputCV native handle of the cudf::column_view being operated on. + * @param targetsCV handle of column containing the strings being searched. + * @param replsCV handle of column containing the strings to replace (can optionally contain a single string). + */ + private static native long stringReplaceMulti(long inputCV, long targetsCV, long replsCV) throws CudfException; + /** * Native method for replacing each regular expression pattern match with the specified * replacement string. 
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index f2c361c5e8c..1213ab305fe 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1546,6 +1546,26 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplace(JNIEnv *env CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplaceMulti(JNIEnv *env, jclass, + jlong inputs_cv, + jlong targets_cv, + jlong repls_cv) { + JNI_NULL_CHECK(env, inputs_cv, "column is null", 0); + JNI_NULL_CHECK(env, targets_cv, "targets string column view is null", 0); + JNI_NULL_CHECK(env, repls_cv, "repls string column view is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::column_view *cv = reinterpret_cast(inputs_cv); + cudf::strings_column_view scv(*cv); + cudf::column_view *cvtargets = reinterpret_cast(targets_cv); + cudf::strings_column_view scvtargets(*cvtargets); + cudf::column_view *cvrepls = reinterpret_cast(repls_cv); + cudf::strings_column_view scvrepls(*cvrepls); + return release_as_jlong(cudf::strings::replace(scv, scvtargets, scvrepls)); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapLookupForKeys(JNIEnv *env, jclass, jlong map_column_view, jlong lookup_keys) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 7848807dab8..8e19c543ee5 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -5146,6 +5146,27 @@ void teststringReplaceThrowsException() { }); } + @Test + void teststringReplaceMulti() { + try (ColumnVector v = ColumnVector.fromStrings("Héllo", "thésssé", null, "", "ARé", "sssstrings"); + ColumnVector e_allParameters = ColumnVector.fromStrings("Hello", "theSse", null, "", "ARe", "SStrings"); + ColumnVector targets = ColumnVector.fromStrings("ss", "é"); + 
ColumnVector repls = ColumnVector.fromStrings("S", "e"); + ColumnVector replace_allParameters = v.stringReplace(targets, repls)) { + assertColumnsAreEqual(e_allParameters, replace_allParameters); + } + } + + @Test + void teststringReplaceMultiThrowsException() { + assertThrows(AssertionError.class, () -> { + try (ColumnVector testStrings = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); + ColumnVector targets = ColumnVector.fromInts(0, 1); + ColumnVector repls = null; + ColumnVector result = testStrings.stringReplace(targets,repls)){} + }); + } + @Test void testReplaceRegex() { try (ColumnVector v = ColumnVector.fromStrings("title and Title with title", "nothing", null, "Title"); From 33e2387b7810011f5648639a6d59b744968b999d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 23 Mar 2023 17:49:54 -0500 Subject: [PATCH 53/63] Fix `DataFrame` constructor to broadcast scalar inputs properly (#12997) Fixes: #12646 This PR fixes an issue with `DataFrame` where broadcasting scalar inputs was order dependent. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/12997 --- python/cudf/cudf/core/dataframe.py | 24 +++++++++++++------ python/cudf/cudf/tests/test_dataframe.py | 30 ++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 672e663d316..9d14d4bde7f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -907,14 +907,24 @@ def _init_from_dict_like( if index is None: num_rows = 0 if data: - col_name = next(iter(data)) - if is_scalar(data[col_name]): - num_rows = num_rows or 1 - else: - data[col_name] = column.as_column( - data[col_name], nan_as_null=nan_as_null + keys, values, lengths = zip( + *( + (k, v, 1) + if is_scalar(v) + else ( + k, + vc := as_column(v, nan_as_null=nan_as_null), + len(vc), + ) + for k, v in data.items() ) - num_rows = len(data[col_name]) + ) + data = dict(zip(keys, values)) + try: + (num_rows,) = (set(lengths) - {1}) or {1} + except ValueError: + raise ValueError("All arrays must be the same length") + self._index = RangeIndex(0, num_rows) else: self._index = as_index(index) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d7912985356..609f5eb488b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10057,3 +10057,33 @@ def test_dataframe_from_arrow_slice(): actual = cudf.DataFrame.from_arrow(table_slice) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": 4}, + {"c": 4, "a": [1, 2, 3], "b": ["x", "y", "z"]}, + {"a": [1, 2, 3], "c": 4}, + ], +) +def test_dataframe_init_from_scalar_and_lists(data): + actual = cudf.DataFrame(data) + expected = pd.DataFrame(data) + + assert_eq(expected, actual) + + 
+def test_dataframe_init_length_error(): + assert_exceptions_equal( + lfunc=pd.DataFrame, + rfunc=cudf.DataFrame, + lfunc_args_and_kwargs=( + [], + {"data": {"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}}, + ), + rfunc_args_and_kwargs=( + [], + {"data": {"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}}, + ), + ) From fb96fc8b1d3f662aba59a9ee1cf388efb3e150cf Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 23 Mar 2023 18:52:32 -0500 Subject: [PATCH 54/63] Fix `GroupBy.apply` doc examples rendering (#12994) Closes https://github.com/rapidsai/cudf/issues/12986. Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/12994 --- python/cudf/cudf/core/groupby/groupby.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 1ab1da92bc7..122e8091050 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1278,13 +1278,14 @@ def mult(df): ``engine='jit'`` may be used to accelerate certain functions, initially those that contain reductions and arithmetic operations between results of those reductions: + >>> import cudf >>> df = cudf.DataFrame({'a':[1,1,2,2,3,3], 'b':[1,2,3,4,5,6]}) >>> df.groupby('a').apply( - ... lambda group: group['b'].max() - group['b'].min(), - ... engine='jit' + ... lambda group: group['b'].max() - group['b'].min(), + ... engine='jit' ... 
) - a None + a None 0 1 1 1 2 1 2 3 1 From a0473cf5b7c52b157584bd02e5c60897c05a57f8 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Fri, 24 Mar 2023 07:56:12 -0500 Subject: [PATCH 55/63] Cache JIT `GroupBy.apply` functions (#12802) This PR sends incoming UDFs that go through the `engine='jit'` codepath through the main UDF cache. This should avoid recompiling if a user reuses the same UDF on different input data, so long as the types of that data are the same. Authors: - https://github.com/brandon-b-miller Approvers: - Ashwin Srinath (https://github.com/shwina) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12802 --- python/cudf/cudf/core/udf/groupby_utils.py | 15 +++++++-- python/cudf/cudf/core/udf/utils.py | 3 +- python/cudf/cudf/tests/test_groupby.py | 37 ++++++++++++++++++++++ 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index dc31cf43292..ebf8c677e55 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -19,11 +19,13 @@ groupby_apply_kernel_template, ) from cudf.core.udf.utils import ( + _generate_cache_key, _get_extensionty_size, _get_kernel, _get_udf_return_type, _supported_cols_from_frame, _supported_dtypes_from_frame, + precompiled, ) from cudf.utils.utils import _cudf_nvtx_annotate @@ -147,12 +149,19 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): offsets = cp.asarray(offsets) ngroups = len(offsets) - 1 - kernel, return_type = _get_groupby_apply_kernel( - grouped_values, function, args + cache_key = _generate_cache_key( + grouped_values, function, suffix="__GROUPBY_APPLY_UDF" ) - return_type = numpy_support.as_dtype(return_type) + if cache_key not in precompiled: + precompiled[cache_key] = _get_groupby_apply_kernel( + grouped_values, function, args + ) 
+ kernel, return_type = precompiled[cache_key] + + return_type = numpy_support.as_dtype(return_type) output = cudf.core.column.column_empty(ngroups, dtype=return_type) + launch_args = [ offsets, output, diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index edc1a16353f..ed0c3332499 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -245,7 +245,7 @@ def _mask_get(mask, pos): return (mask[pos // MASK_BITSIZE] >> (pos % MASK_BITSIZE)) & 1 -def _generate_cache_key(frame, func: Callable): +def _generate_cache_key(frame, func: Callable, suffix="__APPLY_UDF"): """Create a cache key that uniquely identifies a compilation. A new compilation is needed any time any of the following things change: @@ -259,6 +259,7 @@ def _generate_cache_key(frame, func: Callable): ), *(col.mask is None for col in frame._data.values()), *frame._data.keys(), + suffix, ) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 1b86c68b582..2a4b860c196 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -20,6 +20,7 @@ from cudf import DataFrame, Series from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES +from cudf.core.udf.utils import precompiled from cudf.testing._utils import ( DATETIME_TYPES, SIGNED_TYPES, @@ -534,6 +535,42 @@ def diverging_block(grp_df): run_groupby_apply_jit_test(df, diverging_block, ["a"]) +def test_groupby_apply_caching(): + # Make sure similar functions that differ + # by simple things like constants actually + # recompile + + # begin with a clear cache + precompiled.clear() + assert precompiled.currsize == 0 + + data = cudf.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, 3, 4, 5, 6]}) + + def f(group): + return group["b"].mean() * 2 + + # a single run should result in a cache size of 1 + run_groupby_apply_jit_test(data, f, ["a"]) 
+ assert precompiled.currsize == 1 + + # a second run with f should not increase the count + run_groupby_apply_jit_test(data, f, ["a"]) + assert precompiled.currsize == 1 + + # changing a constant value inside the UDF should miss + def f(group): + return group["b"].mean() * 3 + + run_groupby_apply_jit_test(data, f, ["a"]) + assert precompiled.currsize == 2 + + # changing the dtypes of the columns should miss + data["b"] = data["b"].astype("float64") + run_groupby_apply_jit_test(data, f, ["a"]) + + assert precompiled.currsize == 3 + + @pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) @pytest.mark.parametrize( "func", From 9fbc249ce8be1bd515edb91fb768ecad870885aa Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 24 Mar 2023 11:19:12 -0400 Subject: [PATCH 56/63] Remove default detail mrs: part2 (#12965) This is the second PR in a sequence removing default mr parameters in detail APIs. Contributes to https://github.com/rapidsai/cudf/issues/12944. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/12965 --- cpp/include/cudf/detail/concatenate.hpp | 16 ++--- cpp/include/cudf/detail/copy_if.cuh | 41 ++++++------ cpp/include/cudf/detail/copy_if_else.cuh | 19 +++--- cpp/include/cudf/detail/copy_range.cuh | 17 +++-- cpp/include/cudf/detail/null_mask.cuh | 13 ++-- cpp/include/cudf/detail/repeat.hpp | 22 +++---- cpp/include/cudf/detail/replace.hpp | 65 +++++++++---------- cpp/include/cudf/detail/rolling.hpp | 17 +++-- cpp/include/cudf/detail/tdigest/tdigest.hpp | 29 ++++----- cpp/include/cudf/io/detail/avro.hpp | 11 ++-- cpp/include/cudf/io/detail/json.hpp | 11 ++-- cpp/include/cudf_test/tdigest_utilities.cuh | 20 ++++-- cpp/src/dictionary/add_keys.cu | 8 ++- cpp/src/dictionary/detail/concatenate.cu | 4 +- cpp/src/dictionary/set_keys.cu | 4 +- cpp/src/lists/combine/concatenate_rows.cu | 2 +- 
cpp/src/lists/set_operations.cu | 6 +- .../quantiles/tdigest/tdigest_aggregation.cu | 3 +- cpp/src/reductions/reductions.cpp | 2 +- cpp/src/replace/replace.cu | 4 +- cpp/tests/groupby/tdigest_tests.cu | 9 ++- .../quantiles/percentile_approx_test.cpp | 3 +- 22 files changed, 161 insertions(+), 165 deletions(-) diff --git a/cpp/include/cudf/detail/concatenate.hpp b/cpp/include/cudf/detail/concatenate.hpp index 925029597a6..442814bc4fd 100644 --- a/cpp/include/cudf/detail/concatenate.hpp +++ b/cpp/include/cudf/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,20 +33,18 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr concatenate( - host_span columns_to_concat, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate(host_span columns_to_concat, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::concatenate(host_span,rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
concatenate( - host_span tables_to_concat, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
concatenate(host_span tables_to_concat, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 6eea72a1e0d..2870a891f87 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -229,14 +229,13 @@ struct DeviceType()>> { template struct scatter_gather_functor { template ()>* = nullptr> - std::unique_ptr operator()( - cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const* block_offsets, - Filter filter, - cudf::size_type per_thread, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(cudf::column_view const& input, + cudf::size_type const& output_size, + cudf::size_type const* block_offsets, + Filter filter, + cudf::size_type per_thread, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto output_column = cudf::detail::allocate_like( input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); @@ -277,14 +276,13 @@ struct scatter_gather_functor { template () and !cudf::is_fixed_point()>* = nullptr> - std::unique_ptr operator()( - cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const*, - Filter filter, - cudf::size_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(cudf::column_view const& input, + cudf::size_type const& output_size, + cudf::size_type const*, + Filter filter, + cudf::size_type, + rmm::cuda_stream_view stream, + 
rmm::mr::device_memory_resource* mr) { rmm::device_uvector indices(output_size, stream); @@ -320,11 +318,10 @@ struct scatter_gather_functor { * @return unique_ptr
The table generated from filtered `input`. */ template -std::unique_ptr
copy_if( - table_view const& input, - Filter filter, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr
copy_if(table_view const& input, + Filter filter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index b20753239ab..083b12edbf8 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -145,15 +145,14 @@ __launch_bounds__(block_size) __global__ * by `filter[i]` */ template -std::unique_ptr copy_if_else( - bool nullable, - LeftIter lhs_begin, - LeftIter lhs_end, - RightIter rhs, - FilterFn filter, - cudf::data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr copy_if_else(bool nullable, + LeftIter lhs_begin, + LeftIter lhs_end, + RightIter rhs, + FilterFn filter, + cudf::data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // This is the type of the thrust::optional element in the passed iterators using Element = typename thrust::iterator_traits::value_type::value_type; diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 22714e97dfa..0d5aa509e08 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -203,14 +203,13 @@ void copy_range_in_place(column_view const& source, * @param stream CUDA stream used for device memory operations and kernel launches. 
* @return std::unique_ptr The result target column */ -std::unique_ptr copy_range( - column_view const& source, - column_view const& target, - size_type source_begin, - size_type source_end, - size_type target_begin, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr copy_range(column_view const& source, + column_view const& target, + size_type source_begin, + size_type source_end, + size_type target_begin, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index ce2619d767e..3ff3bb4cf3c 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -114,13 +114,12 @@ __global__ void offset_bitmask_binop(Binop op, * @param stream CUDA stream used for device memory operations and kernel launches */ template -std::pair bitmask_binop( - Binop op, - host_span masks, - host_span masks_begin_bits, - size_type mask_size_bits, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::pair bitmask_binop(Binop op, + host_span masks, + host_span masks_begin_bits, + size_type mask_size_bits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto dest_mask = rmm::device_buffer{bitmask_allocation_size_bytes(mask_size_bits), stream, mr}; auto null_count = diff --git a/cpp/include/cudf/detail/repeat.hpp b/cpp/include/cudf/detail/repeat.hpp index 69d9705556f..883d5d158fb 100644 --- a/cpp/include/cudf/detail/repeat.hpp +++ b/cpp/include/cudf/detail/repeat.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,12 +32,11 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
repeat( - table_view const& input_table, - column_view const& count, - bool check_count, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
repeat(table_view const& input_table, + column_view const& count, + bool check_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::repeat(table_view const&, size_type, @@ -45,11 +44,10 @@ std::unique_ptr
repeat( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
repeat( - table_view const& input_table, - size_type count, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
repeat(table_view const& input_table, + size_type count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp index 9721c6e9849..da83f7b285d 100644 --- a/cpp/include/cudf/detail/replace.hpp +++ b/cpp/include/cudf/detail/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,11 +31,10 @@ namespace detail { * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr replace_nulls( - column_view const& input, - cudf::column_view const& replacement, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_nulls(column_view const& input, + cudf::column_view const& replacement, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::replace_nulls(column_view const&, scalar const&, @@ -43,11 +42,10 @@ std::unique_ptr replace_nulls( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr replace_nulls( - column_view const& input, - scalar const& replacement, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_nulls(column_view const& input, + scalar const& replacement, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::replace_nulls(column_view const&, replace_policy const&, @@ -55,11 +53,10 @@ std::unique_ptr replace_nulls( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr replace_nulls( - column_view const& input, - replace_policy const& replace_policy, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_nulls(column_view const& input, + replace_policy const& replace_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::replace_nans(column_view const&, column_view const&, @@ -67,11 +64,10 @@ std::unique_ptr replace_nulls( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr replace_nans( - column_view const& input, - column_view const& replacement, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_nans(column_view const& input, + column_view const& replacement, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::replace_nans(column_view const&, scalar const&, @@ -79,33 +75,30 @@ std::unique_ptr replace_nans( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr replace_nans( - column_view const& input, - scalar const& replacement, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_nans(column_view const& input, + scalar const& replacement, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::find_and_replace_all * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr find_and_replace_all( - column_view const& input_col, - column_view const& values_to_replace, - column_view const& replacement_values, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr find_and_replace_all(column_view const& input_col, + column_view const& values_to_replace, + column_view const& replacement_values, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::normalize_nans_and_zeros * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr normalize_nans_and_zeros( - column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr normalize_nans_and_zeros(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/rolling.hpp b/cpp/include/cudf/detail/rolling.hpp index dcaece2bafc..da90217c254 100644 --- a/cpp/include/cudf/detail/rolling.hpp +++ b/cpp/include/cudf/detail/rolling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,14 +39,13 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr rolling_window( - column_view const& input, - column_view const& preceding_window, - column_view const& following_window, - size_type min_periods, - rolling_aggregation const& agg, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr rolling_window(column_view const& input, + column_view const& preceding_window, + column_view const& following_window, + size_type min_periods, + rolling_aggregation const& agg, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index 9df3f9daf3f..d9fb0efed45 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -131,15 +131,14 @@ std::unique_ptr group_merge_tdigest(column_view const& values, * * @returns The constructed tdigest column. */ -std::unique_ptr make_tdigest_column( - size_type num_rows, - std::unique_ptr&& centroid_means, - std::unique_ptr&& centroid_weights, - std::unique_ptr&& tdigest_offsets, - std::unique_ptr&& min_values, - std::unique_ptr&& max_values, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr make_tdigest_column(size_type num_rows, + std::unique_ptr&& centroid_means, + std::unique_ptr&& centroid_weights, + std::unique_ptr&& tdigest_offsets, + std::unique_ptr&& min_values, + std::unique_ptr&& max_values, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Create an empty tdigest column. 
@@ -151,9 +150,8 @@ std::unique_ptr make_tdigest_column( * * @returns An empty tdigest column. */ -std::unique_ptr make_empty_tdigest_column( - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Create an empty tdigest scalar. @@ -165,9 +163,8 @@ std::unique_ptr make_empty_tdigest_column( * * @returns An empty tdigest scalar. */ -std::unique_ptr make_empty_tdigest_scalar( - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Generate a tdigest column from a grouped, sorted set of numeric input values. diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index c141e25f939..fede8e62d9f 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,11 +36,10 @@ namespace avro { * * @return The set of columns along with table metadata */ -table_with_metadata read_avro( - std::unique_ptr&& source, - avro_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +table_with_metadata read_avro(std::unique_ptr&& source, + avro_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace avro } // namespace detail diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 7d2884880e7..7b0350e9bc8 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -33,11 +33,10 @@ namespace cudf::io::json::detail { * * @return cudf::table object that contains the array of cudf::column. */ -table_with_metadata read_json( - std::vector>& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +table_with_metadata read_json(std::vector>& sources, + json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Write an entire dataset to JSON format. 
@@ -52,5 +51,5 @@ void write_json(data_sink* sink, table_view const& table, json_writer_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr); } // namespace cudf::io::json::detail diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh index d23d7f29a6c..df1900bfa0c 100644 --- a/cpp/include/cudf_test/tdigest_utilities.cuh +++ b/cpp/include/cudf_test/tdigest_utilities.cuh @@ -24,6 +24,9 @@ #include #include +#include +#include + #include #include #include @@ -32,8 +35,6 @@ #include #include -#include - // for use with groupby and reduction aggregation tests. namespace cudf { @@ -268,7 +269,8 @@ void tdigest_simple_all_nulls_aggregation(Func op) static_cast(values).type(), tdigest_gen{}, op, values, delta); // NOTE: an empty tdigest column still has 1 row. - auto expected = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -559,9 +561,12 @@ template void tdigest_merge_empty(MergeFunc merge_op) { // 3 empty tdigests all in the same group - auto a = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); - auto b = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); - auto c = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); + auto a = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + auto b = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + auto c = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream(), + 
rmm::mr::get_current_device_resource()); std::vector cols; cols.push_back(*a); cols.push_back(*b); @@ -571,7 +576,8 @@ void tdigest_merge_empty(MergeFunc merge_op) auto const delta = 1000; auto result = merge_op(*values, delta); - auto expected = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 486e7d2d24b..d543225d3eb 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,8 @@ #include #include +#include + namespace cudf { namespace dictionary { namespace detail { @@ -54,8 +56,8 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column CUDF_EXPECTS(new_keys.type() == old_keys.type(), "Keys must be the same type"); // first, concatenate the keys together // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e] - auto combined_keys = - cudf::detail::concatenate(std::vector{old_keys, new_keys}, stream); + auto combined_keys = cudf::detail::concatenate( + std::vector{old_keys, new_keys}, stream, rmm::mr::get_current_device_resource()); // Drop duplicates from the combined keys, then sort the result. 
// sort(distinct([a,b,c,d,f,d,b,e])) = [a,b,c,d,e,f] diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index bc54f65bbd3..98ad108655f 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -220,7 +221,8 @@ std::unique_ptr concatenate(host_span columns, CUDF_EXPECTS(keys.type() == keys_type, "key types of all dictionary columns must match"); return keys; }); - auto all_keys = cudf::detail::concatenate(keys_views, stream); + auto all_keys = + cudf::detail::concatenate(keys_views, stream, rmm::mr::get_current_device_resource()); // sort keys and remove duplicates; // this becomes the keys child for the output dictionary column diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 075fb6115e3..36f5021d305 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -181,7 +181,7 @@ std::vector> match_dictionaries( { std::vector keys(input.size()); std::transform(input.begin(), input.end(), keys.begin(), [](auto& col) { return col.keys(); }); - auto new_keys = cudf::detail::concatenate(keys, stream); + auto new_keys = cudf::detail::concatenate(keys, stream, rmm::mr::get_current_device_resource()); auto keys_view = new_keys->view(); std::vector> result(input.size()); std::transform(input.begin(), input.end(), result.begin(), [keys_view, mr, stream](auto& col) { diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index b890a0c82a2..993d5e3fc78 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -216,7 +216,7 @@ std::unique_ptr concatenate_rows(table_view const& input, // concatenate the input table into one column. std::vector cols(input.num_columns()); std::copy(input.begin(), input.end(), cols.begin()); - auto concat = cudf::detail::concatenate(cols, stream); + auto concat = cudf::detail::concatenate(cols, stream, rmm::mr::get_current_device_resource()); // whether or not we should be generating a null mask at all auto const build_null_mask = concat->has_nulls(); diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index c05ef2fd644..813bac54e08 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -157,7 +157,8 @@ std::unique_ptr intersect_distinct(lists_column_view const& lhs, auto const intersect_table = cudf::detail::copy_if( rhs_table, [contained = contained.begin()] __device__(auto const idx) { return contained[idx]; }, - stream); + stream, + rmm::mr::get_current_device_resource()); // A stable algorithm is required to ensure that list labels remain contiguous. 
auto out_table = cudf::detail::stable_distinct(intersect_table->view(), @@ -237,7 +238,8 @@ std::unique_ptr difference_distinct(lists_column_view const& lhs, auto const difference_table = cudf::detail::copy_if( lhs_table, [contained = contained.begin()] __device__(auto const idx) { return !contained[idx]; }, - stream); + stream, + rmm::mr::get_current_device_resource()); // A stable algorithm is required to ensure that list labels remain contiguous. auto out_table = cudf::detail::stable_distinct(difference_table->view(), diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index e231d515e86..094e554c3d2 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -1120,7 +1120,8 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, tdigests.end(), std::back_inserter(tdigest_views), [](std::unique_ptr
const& t) { return t->view(); }); - auto merged = cudf::detail::concatenate(tdigest_views, stream); + auto merged = + cudf::detail::concatenate(tdigest_views, stream, rmm::mr::get_current_device_resource()); // generate cumulative weights auto merged_weights = merged->get_column(1).view(); diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index b6c050287cf..2fef8aa8785 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -162,7 +162,7 @@ std::unique_ptr reduce(column_view const& col, // Returns default scalar if input column is empty or all null if (col.size() <= col.null_count()) { if (agg.kind == aggregation::TDIGEST || agg.kind == aggregation::MERGE_TDIGEST) { - return tdigest::detail::make_empty_tdigest_scalar(stream); + return tdigest::detail::make_empty_tdigest_scalar(stream, mr); } if (output_dtype.id() == type_id::LIST) { diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 7f184f793de..373e5ee97e2 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -449,7 +449,9 @@ std::unique_ptr replace_kernel_forwarder::operator()({values.keys(), replacements.keys()}), stream); + std::vector({values.keys(), replacements.keys()}), + stream, + rmm::mr::get_current_device_resource()); return cudf::dictionary::detail::add_keys(input, new_keys->view(), stream, mr); }(); auto matched_view = cudf::dictionary_column_view(matched_input->view()); diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index d7446d4dabb..4052201b064 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -471,13 +471,16 @@ TEST_F(TDigestMergeTest, EmptyGroups) cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0}; int const delta = 1000; - auto a = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); + auto a = 
cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto b = cudf::type_dispatcher( static_cast(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta); - auto c = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); + auto c = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto d = cudf::type_dispatcher( static_cast(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta); - auto e = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); + auto e = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); std::vector cols; cols.push_back(*a); diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index 5809501fe2f..819b342ff8f 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -373,7 +373,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture { TEST_F(PercentileApproxTest, EmptyInput) { - auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); + auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; std::vector input; From 4c4fdd2c86deef16bb68a13bc401193274c45266 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 24 Mar 2023 14:12:37 -0500 Subject: [PATCH 57/63] Drop `force_nullable_schema` from chunked parquet writer (#12996) `force_nullable_schema` was introduced in https://github.com/rapidsai/cudf/pull/12952, however strangely only after it has been merged to `branch-23.04` we are seeing the following pytest failure occur locally: ```python (cudfdev) 
pgali@dt07:/nvme/0/pgali/cudf$ pytest python/dask_cudf/dask_cudf/io/tests/test_parquet.py::test_cudf_list_struct_write ====================================================================================== test session starts ======================================================================================= platform linux -- Python 3.10.9, pytest-7.2.2, pluggy-1.0.0 benchmark: 4.0.0 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000) rootdir: /nvme/0/pgali/cudf/python/dask_cudf plugins: cases-3.6.14, anyio-3.6.2, benchmark-4.0.0, xdist-3.2.1, hypothesis-6.70.0, cov-4.0.0 collected 1 item python/dask_cudf/dask_cudf/io/tests/test_parquet.py F [100%] ============================================================================================ FAILURES ============================================================================================ __________________________________________________________________________________ test_cudf_list_struct_write ___________________________________________________________________________________ tmpdir = local('/tmp/pytest-of-pgali/pytest-84/test_cudf_list_struct_write0') def test_cudf_list_struct_write(tmpdir): df = cudf.DataFrame( { "a": [1, 2, 3], "b": [[[1, 2]], [[2, 3]], None], "c": [[[["a", "z"]]], [[["b", "d", "e"]]], None], } ) df["d"] = df.to_struct() ddf = dask_cudf.from_cudf(df, 3) temp_file = str(tmpdir.join("list_struct.parquet")) > ddf.to_parquet(temp_file) python/dask_cudf/dask_cudf/io/tests/test_parquet.py:493: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ../envs/cudfdev/lib/python3.10/contextlib.py:79: in inner return func(*args, **kwds) python/dask_cudf/dask_cudf/core.py:252: in to_parquet return to_parquet(self, path, *args, **kwargs) 
../envs/cudfdev/lib/python3.10/site-packages/dask/dataframe/io/parquet/core.py:1061: in to_parquet out = out.compute(**compute_kwargs) ../envs/cudfdev/lib/python3.10/site-packages/dask/base.py:314: in compute (result,) = compute(self, traverse=False, **kwargs) ../envs/cudfdev/lib/python3.10/site-packages/dask/base.py:599: in compute results = schedule(dsk, keys, **kwargs) ../envs/cudfdev/lib/python3.10/site-packages/dask/threaded.py:89: in get results = get_async( ../envs/cudfdev/lib/python3.10/site-packages/dask/local.py:511: in get_async raise_exception(exc, tb) ../envs/cudfdev/lib/python3.10/site-packages/dask/local.py:319: in reraise raise exc ../envs/cudfdev/lib/python3.10/site-packages/dask/local.py:224: in execute_task result = _execute_task(task, data) ../envs/cudfdev/lib/python3.10/site-packages/dask/core.py:119: in _execute_task return func(*(_execute_task(a, cache) for a in args)) ../envs/cudfdev/lib/python3.10/site-packages/dask/optimization.py:990: in __call__ return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args))) ../envs/cudfdev/lib/python3.10/site-packages/dask/core.py:149: in get result = _execute_task(task, cache) ../envs/cudfdev/lib/python3.10/site-packages/dask/core.py:119: in _execute_task return func(*(_execute_task(a, cache) for a in args)) ../envs/cudfdev/lib/python3.10/site-packages/dask/dataframe/io/parquet/core.py:171: in __call__ return self.engine.write_partition( python/dask_cudf/dask_cudf/io/parquet.py:349: in write_partition md = df.to_parquet( ../envs/cudfdev/lib/python3.10/site-packages/cudf/core/dataframe.py:6322: in to_parquet return parquet.to_parquet( ../envs/cudfdev/lib/python3.10/contextlib.py:79: in inner return func(*args, **kwds) ../envs/cudfdev/lib/python3.10/site-packages/cudf/io/parquet.py:783: in to_parquet return _write_parquet( ../envs/cudfdev/lib/python3.10/contextlib.py:79: in inner return func(*args, **kwds) ../envs/cudfdev/lib/python3.10/site-packages/cudf/io/parquet.py:105: in _write_parquet 
write_parquet_res = libparquet.write_parquet( ../envs/cudfdev/lib/python3.10/contextlib.py:79: in inner return func(*args, **kwds) _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ > ??? E RuntimeError: CUDF failure at: /nvme/0/pgali/cudf/cpp/src/io/parquet/writer_impl.cu:513: Mismatch in metadata prescribed nullability and input column nullability. Metadata for nullable input column cannot prescribe nullability = false parquet.pyx:432: RuntimeError ==================================================================================== short test summary info ===================================================================================== FAILED python/dask_cudf/dask_cudf/io/tests/test_parquet.py::test_cudf_list_struct_write - RuntimeError: CUDF failure at: /nvme/0/pgali/cudf/cpp/src/io/parquet/writer_impl.cu:513: Mismatch in metadata prescribed nullability and input column nullability. Metadata for nullable inpu... ======================================================================================= 1 failed in 3.90s ======================================================================================== ``` This PR fixes the issue by dropping `force_nullable_schema` from chunked parquet writer. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/12996 --- python/cudf/cudf/_lib/parquet.pyx | 23 ++++++----------------- python/cudf/cudf/tests/test_parquet.py | 20 -------------------- 2 files changed, 6 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 59571b0e4b3..923f5c4089f 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -470,16 +470,6 @@ cdef class ParquetWriter: max_page_size_rows: int, default 20000 Maximum number of rows of each page of the output. By default, 20000 will be used. - force_nullable_schema : bool, default True. - If True, writes all columns as `null` in schema. - If False, columns are written as `null` if they contain null values, - otherwise as `not null`. - - Notes - ----- - `DataFrame.to_parquet` and `ParquetWriter` differ in the default - value for `force_nullable_schema` to enable all the chunks being - written by chunked parquet writer to be schema identical. 
See Also -------- @@ -497,15 +487,13 @@ cdef class ParquetWriter: cdef size_type row_group_size_rows cdef size_t max_page_size_bytes cdef size_type max_page_size_rows - cdef bool force_nullable_schema def __cinit__(self, object filepath_or_buffer, object index=None, object compression="snappy", str statistics="ROWGROUP", int row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, int row_group_size_rows=1000000, int max_page_size_bytes=524288, - int max_page_size_rows=20000, - bool force_nullable_schema=True): + int max_page_size_rows=20000): filepaths_or_buffers = ( list(filepath_or_buffer) if is_list_like(filepath_or_buffer) @@ -520,7 +508,6 @@ cdef class ParquetWriter: self.row_group_size_rows = row_group_size_rows self.max_page_size_bytes = max_page_size_bytes self.max_page_size_rows = max_page_size_rows - self.force_nullable_schema = force_nullable_schema def write_table(self, table, object partitions_info=None): """ Writes a single table to the file """ @@ -615,7 +602,6 @@ cdef class ParquetWriter: _set_col_metadata( table[name]._column, self.tbl_meta.get().column_metadata[i], - self.force_nullable_schema ) index = ( @@ -696,9 +682,12 @@ cdef cudf_io_types.compression_type _get_comp_type(object compression): cdef _set_col_metadata( Column col, column_in_metadata& col_meta, - bool force_nullable_schema + bool force_nullable_schema=False, ): - col_meta.set_nullability(force_nullable_schema or col.nullable) + if force_nullable_schema: + # Only set nullability if `force_nullable_schema` + # is true. 
+ col_meta.set_nullability(True) if is_struct_dtype(col): for i, (child_col, name) in enumerate( diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 9b783b03dad..c24ff080033 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2788,23 +2788,3 @@ def test_parquet_writer_schema_nullability(data, force_nullable_schema): assert pa.parquet.read_schema(file_obj).field(0).nullable == ( force_nullable_schema or df.isnull().any().any() ) - - -@pytest.mark.parametrize("data", [{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}]) -@pytest.mark.parametrize("force_nullable_schema", [True, False]) -def test_parquet_chunked_writer_schema_nullability( - data, force_nullable_schema -): - df = cudf.DataFrame(data) - file_obj = BytesIO() - - writer = ParquetWriter( - file_obj, force_nullable_schema=force_nullable_schema - ) - - writer.write_table(df) - - writer.close() - assert pa.parquet.read_schema(file_obj).field(0).nullable == ( - force_nullable_schema or df.isnull().any().any() - ) From ba8116a2bcee16e50f7dec1711fc436259d54874 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 24 Mar 2023 17:01:42 -0400 Subject: [PATCH 58/63] Fix gtest column utility comparator diff reporting (#12995) Fixes a bug introduced in #12777 (by me) in `column_utilities.cu` that caused the difference reporting in a failed comparison in a gtest result (i.e. through `CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT` for example) to report the incorrect row. The logic was changed to improve compile time but incorrectly created indices of only 0s and 1s in the difference vector. This PR fixes the logic to create the correct indices for the reporting logic. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/cudf/pull/12995 --- cpp/tests/utilities/column_utilities.cu | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 3a94aac1cc9..133ca99b31f 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -543,7 +543,7 @@ struct column_comparator_impl { auto const comparator = cudf::experimental::row::equality::two_table_comparator{ lhs_tview, rhs_tview, cudf::get_default_stream()}; - auto const has_nulls = cudf::has_nested_nulls(lhs_tview) or cudf::has_nested_nulls(rhs_tview); + auto const has_nulls = cudf::has_nulls(lhs_tview) or cudf::has_nulls(rhs_tview); auto const device_comparator = comparator.equal_to(cudf::nullate::DYNAMIC{has_nulls}); @@ -556,18 +556,22 @@ struct column_comparator_impl { lhs_row_indices.size(), cudf::get_default_stream()); // worst case: everything different auto input_iter = thrust::make_counting_iterator(0); + auto diff_map = rmm::device_uvector(lhs_row_indices.size(), cudf::get_default_stream()); + thrust::transform( rmm::exec_policy(cudf::get_default_stream()), input_iter, input_iter + lhs_row_indices.size(), - differences.begin(), + diff_map.begin(), ComparatorType( *d_lhs_row_indices, *d_rhs_row_indices, fp_ulps, device_comparator, *d_lhs, *d_rhs)); - auto diff_iter = thrust::remove(rmm::exec_policy(cudf::get_default_stream()), - differences.begin(), - differences.end(), - 0); // remove the zero entries + auto diff_iter = thrust::copy_if(rmm::exec_policy(cudf::get_default_stream()), + input_iter, + input_iter + lhs_row_indices.size(), + diff_map.begin(), + differences.begin(), + thrust::identity{}); differences.resize(thrust::distance(differences.begin(), diff_iter), 
cudf::get_default_stream()); // shrink back down From 698fcf63121323df376af57626047512d6816b24 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 24 Mar 2023 17:02:58 -0400 Subject: [PATCH 59/63] Fix memcheck script to execute only _TEST files found in bin/gtests/libcudf (#13006) The nightly runs of `compute-sanitizer` started failing due to the test script trying to execute some new extra files found in the `$CONDA_PREFIX"/bin/gtests/libcudf/` directory. This change ensures only files ending in `_TEST` are executed by `compute-sanitizer`. For reference errors are here: https://github.com/rapidsai/cudf/actions/runs/4508267264/jobs/7936800047 Example: ``` Running compute-sanitizer on CTestTestfile.cmake ========= COMPUTE-SANITIZER ========= Error: Target application terminated before first instrumented API call ``` Follow on issue/PR could explore adding a special make option to execute `compute-sanitizer` only on test files. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/13006 --- ci/test_cpp_memcheck.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh index db9ce143d51..0e85268cb72 100755 --- a/ci/test_cpp_memcheck.sh +++ b/ci/test_cpp_memcheck.sh @@ -11,7 +11,7 @@ set +e rapids-logger "Memcheck gtests with rmm_mode=cuda" export GTEST_CUDF_RMM_MODE=cuda COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck" -for gt in "$CONDA_PREFIX"/bin/gtests/libcudf/* ; do +for gt in "$CONDA_PREFIX"/bin/gtests/libcudf/*_TEST ; do test_name=$(basename ${gt}) if [[ "$test_name" == "ERROR_TEST" ]] || [[ "$test_name" == "STREAM_IDENTIFICATION_TEST" ]]; then continue From 12dc13007fd670af53c8fc869a3abebbe6188375 Mon Sep 17 00:00:00 2001 From: David Wendt 
<45795991+davidwendt@users.noreply.github.com> Date: Mon, 27 Mar 2023 09:51:37 -0400 Subject: [PATCH 60/63] Rework some code logic to reduce iterator and comparator inlining to improve compile time (#12900) Disables inlining the device code logic for the row operators for nested column types did not work as hoped. Some files took longer to compile and some functions ran 20% slower for large rows. Reworking individual source files to break up the code logic into multiple kernels seems to work well for compile time while having a smaller effect on performance. The goal is to only rework the nested column code paths. Here are some source files that have compile time issues and are improved in this PR. | source file | current | PR | |:--- | ---:| ---:| | stream_compaction/unique_count.cu | 18 min | 13 min | | groupby/sort/group_nunique.cu | 16 min | 2 min | | stream_compaction/unique.cu | 16 min | 5 min | | groupby/sort/sort_helper.cu | 10 min | 6.5 min | | search/contains_scalar.cu | 12 min | 4.7 min | | sort/is_sorted.cu | 9 min | 7 min | | groupby/sort/group_std.cu | 7 min | 1.2 min | | groupby/sort/group_m2.cu | 6 min | 1.2 min | Available benchmarks showed minimal impact to performance. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Mike Wilson (https://github.com/hyperbolic2346) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/12900 --- cpp/benchmarks/CMakeLists.txt | 1 + .../stream_compaction/unique_count.cpp | 53 +++++++++++++ cpp/src/groupby/sort/group_m2.cu | 18 +++-- cpp/src/groupby/sort/group_nunique.cu | 36 +++++---- cpp/src/groupby/sort/group_std.cu | 18 +++-- cpp/src/groupby/sort/sort_helper.cu | 29 +++++-- cpp/src/search/contains_scalar.cu | 29 ++++--- cpp/src/sort/is_sorted.cu | 25 +++++-- cpp/src/stream_compaction/unique.cu | 75 +++++++++++-------- cpp/src/stream_compaction/unique_count.cu | 20 +++-- 10 files changed, 217 insertions(+), 87 deletions(-) create mode 100644 cpp/benchmarks/stream_compaction/unique_count.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index cc0b642a337..e01d7745e94 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -150,6 +150,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp + stream_compaction/unique_count.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/stream_compaction/unique_count.cpp b/cpp/benchmarks/stream_compaction/unique_count.cpp new file mode 100644 index 00000000000..f8319e0385c --- /dev/null +++ b/cpp/benchmarks/stream_compaction/unique_count.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +#include + +template +void nvbench_unique_count(nvbench::state& state, nvbench::type_list) +{ + auto const num_rows = static_cast(state.get_int64("NumRows")); + auto const nulls = state.get_float64("NullProbability"); + + data_profile profile = data_profile_builder().cardinality(0).null_probability(nulls).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows / 100); + + auto source_column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + auto sorted_table = cudf::sort(cudf::table_view({source_column->view()})); + + auto input = sorted_table->view(); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::unique_count(input, cudf::null_equality::EQUAL); + }); +} + +using data_type = nvbench::type_list; + +NVBENCH_BENCH_TYPES(nvbench_unique_count, NVBENCH_TYPE_AXES(data_type)) + .set_name("unique_count") + .set_type_axes_names({"Type"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}) + .add_float64_axis("NullProbability", {0.0, 0.1}); diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu index edc8b089120..70b05100fb0 100644 --- a/cpp/src/groupby/sort/group_m2.cu +++ b/cpp/src/groupby/sort/group_m2.cu @@ -1,5 +1,5 @@ /* - * Copyright 
(c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,10 +25,12 @@ #include #include +#include #include #include #include +#include namespace cudf { namespace groupby { @@ -62,15 +64,19 @@ void compute_m2_fn(column_device_view const& values, ResultType* d_result, rmm::cuda_stream_view stream) { - auto const var_iter = cudf::detail::make_counting_transform_iterator( - size_type{0}, - m2_transform{ - values, values_iter, d_means, group_labels.data()}); + auto m2_fn = m2_transform{ + values, values_iter, d_means, group_labels.data()}; + auto const itr = thrust::counting_iterator(0); + // Using a temporary buffer for intermediate transform results instead of + // using the transform-iterator directly in thrust::reduce_by_key + // improves compile-time significantly. + auto m2_vals = rmm::device_uvector(values.size(), stream); + thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), m2_vals.begin(), m2_fn); thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), - var_iter, + m2_vals.begin(), thrust::make_discard_iterator(), d_result); } diff --git a/cpp/src/groupby/sort/group_nunique.cu b/cpp/src/groupby/sort/group_nunique.cu index cf81253483e..1a5f1691d5b 100644 --- a/cpp/src/groupby/sort/group_nunique.cu +++ b/cpp/src/groupby/sort/group_nunique.cu @@ -94,21 +94,20 @@ std::unique_ptr group_nunique(column_view const& values, auto const d_values_view = column_device_view::create(values, stream); + auto d_result = rmm::device_uvector(group_labels.size(), stream); + auto const comparator_helper = [&](auto const d_equal) { - auto const is_unique_iterator = - thrust::make_transform_iterator(thrust::counting_iterator(0), - is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()}, - *d_values_view, - d_equal, - null_handling, - group_offsets.data(), - 
group_labels.data()}); - thrust::reduce_by_key(rmm::exec_policy(stream), - group_labels.begin(), - group_labels.end(), - is_unique_iterator, - thrust::make_discard_iterator(), - result->mutable_view().begin()); + auto fn = is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()}, + *d_values_view, + d_equal, + null_handling, + group_offsets.data(), + group_labels.data()}; + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(values.size()), + d_result.begin(), + fn); }; if (cudf::detail::has_nested_columns(values_view)) { @@ -121,6 +120,15 @@ std::unique_ptr group_nunique(column_view const& values, comparator_helper(d_equal); } + // calling this with a vector instead of a transform iterator is 10x faster to compile; + // it also helps that we are only calling it once for both conditions + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + d_result.begin(), + thrust::make_discard_iterator(), + result->mutable_view().begin()); + return result; } diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index a3efc1f172a..8cd2d8baf4e 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -33,6 +34,7 @@ #include #include #include +#include namespace cudf { namespace groupby { @@ -48,7 +50,7 @@ struct var_transform { size_type const* d_group_labels; size_type ddof; - __device__ ResultType operator()(size_type i) + __device__ ResultType operator()(size_type i) const { if (d_values.is_null(i)) return 0.0; @@ -75,15 +77,19 @@ void reduce_by_key_fn(column_device_view const& values, ResultType* d_result, rmm::cuda_stream_view stream) { - auto var_iter = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - var_transform{ - values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof}); + auto var_fn = var_transform{ + 
values, values_iter, d_means, d_group_sizes, group_labels.data(), ddof}; + auto const itr = thrust::make_counting_iterator(0); + // Using a temporary buffer for intermediate transform results instead of + // using the transform-iterator directly in thrust::reduce_by_key + // improves compile-time significantly. + auto vars = rmm::device_uvector(values.size(), stream); + thrust::transform(rmm::exec_policy(stream), itr, itr + values.size(), vars.begin(), var_fn); thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), - var_iter, + vars.begin(), thrust::make_discard_iterator(), d_result); } diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 6e992f2f53b..5b5a6356d67 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -16,6 +16,8 @@ #include "common_utils.cuh" +#include + #include #include #include @@ -144,7 +146,8 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets( { if (_group_offsets) return *_group_offsets; - _group_offsets = std::make_unique(num_keys(stream) + 1, stream); + auto const size = num_keys(stream); + _group_offsets = std::make_unique(size + 1, stream); auto const comparator = cudf::experimental::row::equality::self_comparator{_keys, stream}; @@ -154,23 +157,33 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets( if (cudf::detail::has_nested_columns(_keys)) { auto const d_key_equal = comparator.equal_to( cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL); - result_end = thrust::unique_copy(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(num_keys(stream)), - _group_offsets->begin(), - permuted_row_equality_comparator(d_key_equal, sorted_order)); + // Using a temporary buffer for intermediate transform results from the iterator containing + // the comparator speeds up compile-time significantly without much degradation in + 
// runtime performance over using the comparator directly in thrust::unique_copy. + auto result = rmm::device_uvector(size, stream); + auto const itr = thrust::make_counting_iterator(0); + auto const row_eq = permuted_row_equality_comparator(d_key_equal, sorted_order); + auto const ufn = cudf::detail::unique_copy_fn{ + itr, duplicate_keep_option::KEEP_FIRST, row_eq, size - 1}; + thrust::transform(rmm::exec_policy(stream), itr, itr + size, result.begin(), ufn); + result_end = thrust::copy_if(rmm::exec_policy(stream), + itr, + itr + size, + result.begin(), + _group_offsets->begin(), + thrust::identity{}); } else { auto const d_key_equal = comparator.equal_to( cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL); result_end = thrust::unique_copy(rmm::exec_policy(stream), thrust::counting_iterator(0), - thrust::counting_iterator(num_keys(stream)), + thrust::counting_iterator(size), _group_offsets->begin(), permuted_row_equality_comparator(d_key_equal, sorted_order)); } size_type num_groups = thrust::distance(_group_offsets->begin(), result_end); - _group_offsets->set_element(num_groups, num_keys(stream), stream); + _group_offsets->set_element(num_groups, size, stream); _group_offsets->resize(num_groups + 1, stream); return *_group_offsets; diff --git a/cpp/src/search/contains_scalar.cu b/cpp/src/search/contains_scalar.cu index 093a1f8f1ed..7c16a1b12ef 100644 --- a/cpp/src/search/contains_scalar.cu +++ b/cpp/src/search/contains_scalar.cu @@ -30,6 +30,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -108,16 +109,24 @@ struct contains_scalar_dispatch { auto const haystack_cdv_ptr = column_device_view::create(haystack, stream); auto const d_comp = comparator.equal_to(nullate::DYNAMIC{has_nulls}); - return thrust::count_if( - rmm::exec_policy(stream), - begin, - end, - [d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) { - if (check_nulls && d_haystack.is_null_nocheck(static_cast(idx))) { - return 
false; - } - return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0]. - }) > 0; + + // Using a temporary buffer for intermediate transform results from the lambda containing + // the comparator speeds up compile-time significantly without much degradation in + // runtime performance over using the comparator in a transform iterator with thrust::count_if. + auto d_results = rmm::device_uvector(haystack.size(), stream); + thrust::transform( + rmm::exec_policy(stream), + begin, + end, + d_results.begin(), + [d_comp, check_nulls, d_haystack = *haystack_cdv_ptr] __device__(auto const idx) { + if (check_nulls && d_haystack.is_null_nocheck(static_cast(idx))) { + return false; + } + return d_comp(idx, rhs_index_type{0}); // compare haystack[idx] == needle[0]. + }); + + return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), true) > 0; } }; diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu index 4c5ad1ef0ea..25c594e9e74 100644 --- a/cpp/src/sort/is_sorted.cu +++ b/cpp/src/sort/is_sorted.cu @@ -27,13 +27,15 @@ #include #include +#include #include #include +#include namespace cudf { namespace detail { -auto is_sorted(cudf::table_view const& in, +bool is_sorted(cudf::table_view const& in, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream) @@ -44,16 +46,25 @@ auto is_sorted(cudf::table_view const& in, if (cudf::detail::has_nested_columns(in)) { auto const device_comparator = comparator.less(has_nested_nulls(in)); - return thrust::is_sorted(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(in.num_rows()), - device_comparator); + // Using a temporary buffer for intermediate transform results from the lambda containing + // the comparator speeds up compile-time significantly over using the comparator directly + // in thrust::is_sorted. 
+ auto d_results = rmm::device_uvector(in.num_rows(), stream); + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(in.num_rows()), + d_results.begin(), + [device_comparator] __device__(auto idx) -> bool { + return (idx == 0) || device_comparator(idx - 1, idx); + }); + + return thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), false) == 0; } else { auto const device_comparator = comparator.less(has_nested_nulls(in)); return thrust::is_sorted(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(in.num_rows()), + thrust::counting_iterator(0), + thrust::counting_iterator(in.num_rows()), device_comparator); } } diff --git a/cpp/src/stream_compaction/unique.cu b/cpp/src/stream_compaction/unique.cu index 511a7b7ae1c..2d81c00e9d9 100644 --- a/cpp/src/stream_compaction/unique.cu +++ b/cpp/src/stream_compaction/unique.cu @@ -67,38 +67,51 @@ std::unique_ptr
unique(table_view const& input, auto comp = cudf::experimental::row::equality::self_comparator(keys_view, stream); - auto const comparator_helper = [&](auto const row_equal) { - // get indices of unique rows - auto result_end = unique_copy(thrust::counting_iterator(0), - thrust::counting_iterator(num_rows), - mutable_view->begin(), - row_equal, - keep, - stream); + size_type const unique_size = [&] { + if (cudf::detail::has_nested_columns(keys_view)) { + // Using a temporary buffer for intermediate transform results from the functor containing + // the comparator speeds up compile-time significantly without much degradation in + // runtime performance over using the comparator directly in thrust::unique_copy. + auto row_equal = + comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal); + auto d_results = rmm::device_uvector(num_rows, stream); + auto itr = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + itr, + itr + num_rows, + d_results.begin(), + unique_copy_fn{itr, keep, row_equal, num_rows - 1}); + auto result_end = thrust::copy_if(rmm::exec_policy(stream), + itr, + itr + num_rows, + d_results.begin(), + mutable_view->begin(), + thrust::identity{}); + return static_cast(thrust::distance(mutable_view->begin(), result_end)); + } else { + // Using thrust::unique_copy with the comparator directly will compile more slowly but + // improves runtime by up to 2x over the transform/copy_if approach above. 
+ auto row_equal = + comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal); + auto result_end = unique_copy(thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + mutable_view->begin(), + row_equal, + keep, + stream); + return static_cast(thrust::distance(mutable_view->begin(), result_end)); + } + }(); + auto indices_view = cudf::detail::slice(column_view(*unique_indices), 0, unique_size); - auto indices_view = - cudf::detail::slice(column_view(*unique_indices), - 0, - thrust::distance(mutable_view->begin(), result_end)); - - // gather unique rows and return - return detail::gather(input, - indices_view, - out_of_bounds_policy::DONT_CHECK, - detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - }; - - if (cudf::detail::has_nested_columns(keys_view)) { - auto row_equal = - comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal); - return comparator_helper(row_equal); - } else { - auto row_equal = - comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys_view)}, nulls_equal); - return comparator_helper(row_equal); - } + // gather unique rows and return + return detail::gather(input, + indices_view, + out_of_bounds_policy::DONT_CHECK, + detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); } } // namespace detail diff --git a/cpp/src/stream_compaction/unique_count.cu b/cpp/src/stream_compaction/unique_count.cu index 4c1cf2b2bc3..ac9924311c2 100644 --- a/cpp/src/stream_compaction/unique_count.cu +++ b/cpp/src/stream_compaction/unique_count.cu @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -76,14 +77,23 @@ cudf::size_type unique_count(table_view const& keys, if (cudf::detail::has_nested_columns(keys)) { auto const comp = row_comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal); - return thrust::count_if( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(keys.num_rows()), - [comp] __device__(cudf::size_type i) { return (i 
== 0 or not comp(i, i - 1)); }); + // Using a temporary buffer for intermediate transform results from the lambda containing + // the comparator speeds up compile-time significantly without much degradation in + // runtime performance over using the comparator directly in thrust::count_if. + auto d_results = rmm::device_uvector(keys.num_rows(), stream); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(keys.num_rows()), + d_results.begin(), + [comp] __device__(auto i) { return (i == 0 or not comp(i, i - 1)); }); + + return static_cast( + thrust::count(rmm::exec_policy(stream), d_results.begin(), d_results.end(), true)); } else { auto const comp = row_comp.equal_to(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal); + // Using thrust::copy_if with the comparator directly will compile more slowly but + // improves runtime by up to 2x over the transform/count approach above. return thrust::count_if( rmm::exec_policy(stream), thrust::counting_iterator(0), From bc7b89f6aca49d039322272067dcea71f045fa82 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 27 Mar 2023 10:04:44 -0500 Subject: [PATCH 61/63] Handle index names while performing `groupby` (#12992) Fixes: #12759 This PR fixes the above issue by factoring in index names while performing a `groupby` by label name. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/12992 --- python/cudf/cudf/core/groupby/groupby.py | 9 ++++++++- python/cudf/cudf/tests/test_groupby.py | 12 ++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 122e8091050..cb4c0f6b48b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -2319,7 +2319,14 @@ def _handle_mapping(self, by): self._handle_series(by) def _handle_label(self, by): - self._key_columns.append(self._obj._data[by]) + try: + self._key_columns.append(self._obj._data[by]) + except KeyError as e: + # `by` can be index name(label) too. + if by in self._obj._index.names: + self._key_columns.append(self._obj._index._data[by]) + else: + raise e self.names.append(by) self._named_columns.append(by) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 2a4b860c196..e58d70f49c7 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3001,6 +3001,18 @@ def test_groupby_dtypes(groups): assert_eq(pdf.groupby(groups).dtypes, df.groupby(groups).dtypes) +@pytest.mark.parametrize("index_names", ["a", "b", "c", ["b", "c"]]) +def test_groupby_by_index_names(index_names): + gdf = cudf.DataFrame( + {"a": [1, 2, 3, 4], "b": ["a", "b", "a", "a"], "c": [1, 1, 2, 1]} + ).set_index(index_names) + pdf = gdf.to_pandas() + + assert_groupby_results_equal( + pdf.groupby(index_names).min(), gdf.groupby(index_names).min() + ) + + class TestSample: @pytest.fixture(params=["default", "rangeindex", "intindex", "strindex"]) def index(self, request): From 766af666575b5db6f265e1fbf6466ccfd46eae30 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Mon, 27 Mar 2023 13:15:41 -0400 Subject: [PATCH 
62/63] add sphinx building and s3 uploading for dask-cudf docs (#12982) follow up PR for https://github.com/rapidsai/cudf/pull/12725 Authors: - Benjamin Zaitlen (https://github.com/quasiben) - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Sevag H (https://github.com/sevagh) URL: https://github.com/rapidsai/cudf/pull/12982 --- ci/build_docs.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 6daedb59733..4955fe08982 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -33,16 +33,25 @@ aws s3 cp s3://rapidsai-docs/librmm/${VERSION_NUMBER}/html/rmm.tag . || echo "Fa doxygen Doxyfile popd -rapids-logger "Build Sphinx docs" +rapids-logger "Build cuDF Sphinx docs" pushd docs/cudf sphinx-build -b dirhtml source _html sphinx-build -b text source _text popd +rapids-logger "Build dask-cuDF Sphinx docs" +pushd docs/dask_cudf +sphinx-build -b dirhtml source _html +sphinx-build -b text source _text +popd + + if [[ ${RAPIDS_BUILD_TYPE} == "branch" ]]; then rapids-logger "Upload Docs to S3" aws s3 sync --no-progress --delete cpp/doxygen/html "s3://rapidsai-docs/libcudf/${VERSION_NUMBER}/html" aws s3 sync --no-progress --delete docs/cudf/_html "s3://rapidsai-docs/cudf/${VERSION_NUMBER}/html" aws s3 sync --no-progress --delete docs/cudf/_text "s3://rapidsai-docs/cudf/${VERSION_NUMBER}/txt" + aws s3 sync --no-progress --delete docs/dask_cudf/_html "s3://rapidsai-docs/dask-cudf/${VERSION_NUMBER}/html" + aws s3 sync --no-progress --delete docs/dask_cudf/_text "s3://rapidsai-docs/dask-cudf/${VERSION_NUMBER}/txt" fi From 173fde9d9ae335014a1aa6de60b417f6f245dcf2 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 27 Mar 2023 19:36:13 -0400 Subject: [PATCH 63/63] Add nunique aggregation support for cudf::segmented_reduce (#12972) Adds support for `NUNIQUE` 
aggregation type for `cudf::segmented_reduce`. This computes the number of unique elements within each segment specified. Due to the overhead of sorting, the segments must be sorted before calling this function otherwise the results are undefined. Also, only non-nested column types are supported as well. Reference #10432 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Divye Gala (https://github.com/divyegala) - Karthikeyan (https://github.com/karthikeyann) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/12972 --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/CMakeLists.txt | 2 +- ...segment_reduce.cu => segmented_reduce.cpp} | 73 +++++++---- .../cudf/detail/aggregation/aggregation.hpp | 4 +- .../detail/segmented_reduction_functions.hpp | 28 +++++ cpp/src/aggregation/aggregation.cpp | 2 + cpp/src/reductions/segmented/nunique.cu | 113 ++++++++++++++++++ cpp/src/reductions/segmented/reductions.cpp | 2 + .../reductions/segmented_reduction_tests.cpp | 42 +++++++ 9 files changed, 240 insertions(+), 27 deletions(-) rename cpp/benchmarks/reduction/{segment_reduce.cu => segmented_reduce.cpp} (58%) create mode 100644 cpp/src/reductions/segmented/nunique.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 13583378134..127df03c54d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -464,6 +464,7 @@ add_library( src/reductions/segmented/max.cu src/reductions/segmented/mean.cu src/reductions/segmented/min.cu + src/reductions/segmented/nunique.cu src/reductions/segmented/product.cu src/reductions/segmented/reductions.cpp src/reductions/segmented/std.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index e01d7745e94..b9c15e244de 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -192,7 +192,7 @@ ConfigureBench( ) ConfigureNVBench( REDUCTION_NVBENCH 
reduction/distinct_count.cpp reduction/rank.cpp reduction/scan_structs.cpp - reduction/segment_reduce.cu + reduction/segmented_reduce.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/reduction/segment_reduce.cu b/cpp/benchmarks/reduction/segmented_reduce.cpp similarity index 58% rename from cpp/benchmarks/reduction/segment_reduce.cu rename to cpp/benchmarks/reduction/segmented_reduce.cpp index 127b3598dae..590a014ad76 100644 --- a/cpp/benchmarks/reduction/segment_reduce.cu +++ b/cpp/benchmarks/reduction/segmented_reduce.cpp @@ -20,17 +20,15 @@ #include #include -#include +#include #include +#include #include #include #include -#include - #include -#include bool constexpr is_boolean_output_agg(cudf::segmented_reduce_aggregation::Kind kind) { @@ -38,8 +36,15 @@ bool constexpr is_boolean_output_agg(cudf::segmented_reduce_aggregation::Kind ki kind == cudf::segmented_reduce_aggregation::ANY; } +bool constexpr is_float_output_agg(cudf::segmented_reduce_aggregation::Kind kind) +{ + return kind == cudf::segmented_reduce_aggregation::MEAN || + kind == cudf::segmented_reduce_aggregation::VARIANCE || + kind == cudf::segmented_reduce_aggregation::STD; +} + template -std::unique_ptr make_simple_aggregation() +std::unique_ptr make_reduce_aggregation() { switch (kind) { case cudf::segmented_reduce_aggregation::SUM: @@ -54,12 +59,22 @@ std::unique_ptr make_simple_aggregation() return cudf::make_all_aggregation(); case cudf::segmented_reduce_aggregation::ANY: return cudf::make_any_aggregation(); - default: CUDF_FAIL("Unsupported simple segmented aggregation"); + case cudf::segmented_reduce_aggregation::SUM_OF_SQUARES: + return cudf::make_sum_of_squares_aggregation(); + case cudf::segmented_reduce_aggregation::MEAN: + return cudf::make_mean_aggregation(); + case cudf::segmented_reduce_aggregation::VARIANCE: + return cudf::make_variance_aggregation(); + case cudf::segmented_reduce_aggregation::STD: 
+ return cudf::make_std_aggregation(); + case cudf::segmented_reduce_aggregation::NUNIQUE: + return cudf::make_nunique_aggregation(); + default: CUDF_FAIL("Unsupported segmented reduce aggregation in this benchmark"); } } template -std::pair, thrust::device_vector> make_test_data( +std::pair, std::unique_ptr> make_test_data( nvbench::state& state) { auto const column_size{cudf::size_type(state.get_int64("column_size"))}; @@ -72,28 +87,30 @@ std::pair, thrust::device_vector> dtype, distribution_id::UNIFORM, 0, 100); auto input = create_random_column(dtype, row_count{column_size}, profile); - auto offset_it = cudf::detail::make_counting_transform_iterator( - 0, [column_size, segment_length] __device__(auto i) { - return column_size < i * segment_length ? column_size : i * segment_length; - }); - - thrust::device_vector d_offsets(offset_it, offset_it + num_segments + 1); - - return std::pair(std::move(input), d_offsets); + auto offsets = cudf::sequence(num_segments + 1, + cudf::numeric_scalar(0), + cudf::numeric_scalar(segment_length)); + return std::pair(std::move(input), std::move(offsets)); } template -void BM_Simple_Segmented_Reduction(nvbench::state& state, - nvbench::type_list>) +void BM_Segmented_Reduction(nvbench::state& state, + nvbench::type_list>) { auto const column_size{cudf::size_type(state.get_int64("column_size"))}; auto const num_segments{cudf::size_type(state.get_int64("num_segments"))}; auto [input, offsets] = make_test_data(state); - auto agg = make_simple_aggregation(); + auto agg = make_reduce_aggregation(); - auto output_type = is_boolean_output_agg(kind) ? 
cudf::data_type{cudf::type_id::BOOL8} - : cudf::data_type{cudf::type_to_id()}; + auto const output_type = [] { + if (is_boolean_output_agg(kind)) { return cudf::data_type{cudf::type_id::BOOL8}; } + if (is_float_output_agg(kind)) { return cudf::data_type{cudf::type_id::FLOAT64}; } + if (kind == cudf::segmented_reduce_aggregation::NUNIQUE) { + return cudf::data_type{cudf::type_to_id()}; + } + return cudf::data_type{cudf::type_to_id()}; + }(); state.add_element_count(column_size); state.add_global_memory_reads(column_size); @@ -103,8 +120,10 @@ void BM_Simple_Segmented_Reduction(nvbench::state& state, state.add_global_memory_writes(num_segments); } - auto const input_view = input->view(); - auto const offset_span = cudf::device_span{offsets}; + auto const input_view = input->view(); + auto const offsets_view = offsets->view(); + auto const offset_span = cudf::device_span{ + offsets_view.template data(), static_cast(offsets_view.size())}; state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec( @@ -115,13 +134,17 @@ void BM_Simple_Segmented_Reduction(nvbench::state& state, using Types = nvbench::type_list; // Skip benchmarking MAX/ANY since they are covered by MIN/ALL respectively. +// Also VARIANCE includes STD calculation. 
using AggKinds = nvbench::enum_type_list; + cudf::aggregation::ALL, + cudf::aggregation::MEAN, + cudf::aggregation::VARIANCE, + cudf::aggregation::NUNIQUE>; -NVBENCH_BENCH_TYPES(BM_Simple_Segmented_Reduction, NVBENCH_TYPE_AXES(Types, AggKinds)) - .set_name("segmented_reduction_simple") +NVBENCH_BENCH_TYPES(BM_Segmented_Reduction, NVBENCH_TYPE_AXES(Types, AggKinds)) + .set_name("segmented_reduction") .set_type_axes_names({"DataType", "AggregationKinds"}) .add_int64_axis("column_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) .add_int64_axis("num_segments", {1'000, 10'000, 100'000}); diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index e269d4d2e13..b688bf3d445 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -535,7 +535,9 @@ class argmin_aggregation final : public rolling_aggregation, public groupby_aggr /** * @brief Derived class for specifying a nunique aggregation */ -class nunique_aggregation final : public groupby_aggregation, public reduce_aggregation { +class nunique_aggregation final : public groupby_aggregation, + public reduce_aggregation, + public segmented_reduce_aggregation { public: nunique_aggregation(null_policy null_handling) : aggregation{NUNIQUE}, _null_handling{null_handling} diff --git a/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp index c1bf59e5f65..3902a7200a9 100644 --- a/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp +++ b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp @@ -325,6 +325,34 @@ std::unique_ptr segmented_variance(column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Counts the number of unique values within each segment of a column + * + * Unique entries are counted by comparing adjacent values 
so the column segments + * are expected to be sorted before calling this function otherwise the results + * are undefined. + * + * If any input segment is empty, that segment's result is null. + * + * If `null_handling==null_policy::INCLUDE`, the segment count is the number of + * unique values +1 which includes all the null entries in that segment. + * If `null_handling==null_policy::EXCLUDE`, the segment count does not include nulls. + * + * @throw cudf::logic_error if input column type is a nested type + * + * @param col Input column data + * @param offsets Indices to identify segment boundaries within input `col` + * @param null_handling Specifies how null elements are processed for each segment + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Column of unique counts per segment + */ +std::unique_ptr segmented_nunique(column_view const& col, + device_span offsets, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace reduction } // namespace cudf diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 07c53b3a421..2e6a643484e 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -605,6 +605,8 @@ template std::unique_ptr make_nunique_aggregation make_nunique_aggregation( null_policy null_handling); +template std::unique_ptr +make_nunique_aggregation(null_policy null_handling); /// Factory to create an NTH_ELEMENT aggregation template diff --git a/cpp/src/reductions/segmented/nunique.cu b/cpp/src/reductions/segmented/nunique.cu new file mode 100644 index 00000000000..bd1efb41df8 --- /dev/null +++ b/cpp/src/reductions/segmented/nunique.cu @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "update_validity.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf { +namespace reduction { +namespace detail { +namespace { +template +struct is_unique_fn { + column_device_view const d_col; + ComparatorType row_equal; + null_policy null_handling; + size_type const* offsets; + size_type const* labels; + + __device__ size_type operator()(size_type idx) const + { + if (null_handling == null_policy::EXCLUDE && d_col.is_null(idx)) { return 0; } + return static_cast(offsets[labels[idx]] == idx || (!row_equal(idx, idx - 1))); + } +}; +} // namespace + +std::unique_ptr segmented_nunique(column_view const& col, + device_span offsets, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // only support non-nested types + CUDF_EXPECTS(!cudf::is_nested(col.type()), + "segmented reduce nunique only supports non-nested column types"); + + // compute the unique identifiers within each segment + auto const identifiers = [&] { + auto const d_col = column_device_view::create(col, stream); + auto const comparator = + cudf::experimental::row::equality::self_comparator{table_view({col}), stream}; + auto const row_equal = + comparator.equal_to(cudf::nullate::DYNAMIC{col.has_nulls()}, null_equality::EQUAL); + + auto labels = rmm::device_uvector(col.size(), stream); + 
cudf::detail::label_segments( + offsets.begin(), offsets.end(), labels.begin(), labels.end(), stream); + auto fn = is_unique_fn{ + *d_col, row_equal, null_handling, offsets.data(), labels.data()}; + + auto identifiers = rmm::device_uvector(col.size(), stream); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(col.size()), + identifiers.begin(), + fn); + return identifiers; + }(); + + auto result = cudf::make_numeric_column(data_type(type_to_id()), + static_cast(offsets.size() - 1), + cudf::mask_state::UNALLOCATED, + stream, + mr); + + // Sum the unique identifiers within each segment + auto add_op = op::sum{}; + cudf::reduction::detail::segmented_reduce(identifiers.begin(), + offsets.begin(), + offsets.end(), + result->mutable_view().data(), + add_op.get_binary_op(), + 0, + stream); + + // Compute the output null mask + // - only empty segments are tagged as null + // - nulls are counted appropriately above per null_handling policy + auto const bitmask_col = null_handling == null_policy::EXCLUDE ? 
col : result->view(); + cudf::reduction::detail::segmented_update_validity( + *result, bitmask_col, offsets, null_policy::EXCLUDE, std::nullopt, stream, mr); + + return result; +} +} // namespace detail +} // namespace reduction +} // namespace cudf diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index 66b98fa8322..cee82560794 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -95,6 +95,8 @@ struct segmented_reduce_dispatch_functor { return segmented_standard_deviation( col, offsets, output_dtype, null_handling, var_agg._ddof, stream, mr); } + case segmented_reduce_aggregation::NUNIQUE: + return segmented_nunique(col, offsets, null_handling, stream, mr); default: CUDF_FAIL("Unsupported aggregation type."); } } diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index 40b0d268580..77fdad09c0b 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -927,6 +927,48 @@ TEST_F(SegmentedReductionTestUntyped, VarianceNulls) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } +TEST_F(SegmentedReductionTestUntyped, NUnique) +{ + auto const input = + cudf::test::fixed_width_column_wrapper({10, 15, 20, 30, 60, 60, 70, 70, 80}); + auto const offsets = std::vector{0, 1, 1, 2, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const agg = cudf::make_nunique_aggregation(); + auto const output_type = cudf::data_type{cudf::type_id::INT32}; + + auto expected = + cudf::test::fixed_width_column_wrapper{{1, 0, 1, 2, 3}, {1, 0, 1, 1, 1}}; + auto result = + cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + + result = 
cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + +TEST_F(SegmentedReductionTestUntyped, NUniqueNulls) +{ + auto const input = cudf::test::fixed_width_column_wrapper( + {10, 0, 20, 30, 60, 60, 70, 70, 0}, {1, 0, 1, 1, 1, 1, 1, 1, 0}); + auto const offsets = std::vector{0, 1, 1, 2, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const agg = cudf::make_nunique_aggregation(); + auto const output_type = cudf::data_type{cudf::type_id::INT32}; + + auto expected = + cudf::test::fixed_width_column_wrapper{{1, 0, 0, 2, 2}, {1, 0, 0, 1, 1}}; + auto result = + cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + + expected = + cudf::test::fixed_width_column_wrapper{{1, 0, 1, 2, 3}, {1, 0, 1, 1, 1}}; + result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); +} + TEST_F(SegmentedReductionTestUntyped, Errors) { auto const input = cudf::test::fixed_width_column_wrapper(