diff --git a/build.sh b/build.sh index bee66d819b4..7cbd0fceb5a 100755 --- a/build.sh +++ b/build.sh @@ -300,8 +300,7 @@ if buildAll || hasArg libcudf; then # Record build times if [[ "$BUILD_REPORT_METRICS" == "ON" && -f "${LIB_BUILD_DIR}/.ninja_log" ]]; then echo "Formatting build metrics" - python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt xml > ${LIB_BUILD_DIR}/ninja_log.xml - MSG="

" + MSG="" # get some sccache stats after the compile if [[ "$BUILD_REPORT_INCL_CACHE_STATS" == "ON" && -x "$(command -v sccache)" ]]; then COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }') @@ -318,7 +317,9 @@ if buildAll || hasArg libcudf; then BMR_DIR=${RAPIDS_ARTIFACTS_DIR:-"${LIB_BUILD_DIR}"} echo "Metrics output dir: [$BMR_DIR]" mkdir -p ${BMR_DIR} - python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${BMR_DIR}/ninja_log.html + MSG_OUTFILE="$(mktemp)" + echo "$MSG" > "${MSG_OUTFILE}" + python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "${MSG_OUTFILE}" > ${BMR_DIR}/ninja_log.html cp ${LIB_BUILD_DIR}/.ninja_log ${BMR_DIR}/ninja.log fi diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index b68c2bdbef6..bc27e7d76b0 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -14,29 +14,3 @@ rapids-logger "Begin cpp build" rapids-mamba-retry mambabuild conda/recipes/libcudf rapids-upload-conda-to-s3 cpp - -echo "++++++++++++++++++++++++++++++++++++++++++++" - -if [[ -d $RAPIDS_ARTIFACTS_DIR ]]; then - ls -l ${RAPIDS_ARTIFACTS_DIR} -fi - -echo "++++++++++++++++++++++++++++++++++++++++++++" - -FILE=${RAPIDS_ARTIFACTS_DIR}/ninja.log -if [[ -f $FILE ]]; then - echo -e "\x1B[33;1m\x1B[48;5;240m Ninja log for this build available at the following link \x1B[0m" - UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).ninja.log - rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}" -fi - -echo "++++++++++++++++++++++++++++++++++++++++++++" - -FILE=${RAPIDS_ARTIFACTS_DIR}/ninja_log.html -if [[ -f $FILE ]]; then - echo -e "\x1B[33;1m\x1B[48;5;240m Build Metrics Report for this build available at the following link \x1B[0m" - UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).BuildMetricsReport.html - rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}" -fi - -echo "++++++++++++++++++++++++++++++++++++++++++++" diff --git 
a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh index 9d9758f1f15..0c55c4b9141 100755 --- a/ci/release/apply_wheel_modifications.sh +++ b/ci/release/apply_wheel_modifications.sh @@ -6,12 +6,6 @@ VERSION=${1} CUDA_SUFFIX=${2} -# __init__.py versions -sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/cudf/cudf/__init__.py -sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/dask_cudf/dask_cudf/__init__.py -sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/cudf_kafka/cudf_kafka/__init__.py -sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/custreamz/custreamz/__init__.py - # pyproject.toml versions sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/cudf/pyproject.toml sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/dask_cudf/pyproject.toml diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index caa807bd7ec..0b2fc71aacd 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -92,6 +92,7 @@ outputs: - test -f $PREFIX/include/cudf/concatenate.hpp - test -f $PREFIX/include/cudf/copying.hpp - test -f $PREFIX/include/cudf/datetime.hpp + - test -f $PREFIX/include/cudf/timezone.hpp - test -f $PREFIX/include/cudf/detail/aggregation/aggregation.hpp - test -f $PREFIX/include/cudf/detail/aggregation/result_cache.hpp - test -f $PREFIX/include/cudf/detail/binaryop.hpp @@ -128,6 +129,8 @@ outputs: - test -f $PREFIX/include/cudf/detail/stream_compaction.hpp - test -f $PREFIX/include/cudf/detail/structs/utilities.hpp - test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp + - test -f $PREFIX/include/cudf/detail/timezone.cuh + - test -f $PREFIX/include/cudf/detail/timezone.hpp - test -f $PREFIX/include/cudf/detail/transform.hpp - test -f $PREFIX/include/cudf/detail/transpose.hpp - test -f $PREFIX/include/cudf/detail/unary.hpp diff --git a/conda/recipes/libcudf/post-link.sh b/conda/recipes/libcudf/post-link.sh 
index 64e0b1ad305..8ae2349f791 100644 --- a/conda/recipes/libcudf/post-link.sh +++ b/conda/recipes/libcudf/post-link.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # Only add the license notice to libcudf and not our examples / tests if [[ "$PKG_NAME" == "libcudf" ]]; then - cat ./nvlink.txt >> $PREFIX/.messages.txt + cat ./nvcomp.txt >> $PREFIX/.messages.txt fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0fcd1895972..13583378134 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -369,7 +369,7 @@ add_library( src/io/orc/stripe_data.cu src/io/orc/stripe_enc.cu src/io/orc/stripe_init.cu - src/io/orc/timezone.cpp + src/datetime/timezone.cpp src/io/orc/writer_impl.cu src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp @@ -890,31 +890,20 @@ install( EXPORT cudf-exports ) -install(DIRECTORY ${CUDF_SOURCE_DIR}/include/cudf ${CUDF_SOURCE_DIR}/include/cudf_test - ${CUDF_SOURCE_DIR}/include/nvtext DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} -) - -if(CUDF_BUILD_TESTUTIL) +set(_components_export_string) +if(TARGET cudftestutil) install( TARGETS cudftest_default_stream cudftestutil DESTINATION ${lib_dir} EXPORT cudf-testing-exports ) - - install( - EXPORT cudf-testing-exports - FILE cudf-testing-targets.cmake - NAMESPACE cudf:: - DESTINATION "${lib_dir}/cmake/cudf" - ) - - include("${rapids-cmake-dir}/export/write_dependencies.cmake") - rapids_export_write_dependencies( - INSTALL cudf-testing-exports - "${PROJECT_BINARY_DIR}/rapids-cmake/cudf/export/cudf-testing-dependencies.cmake" - ) + set(_components_export_string COMPONENTS testing COMPONENTS_EXPORT_SET cudf-testing-exports) endif() +install(DIRECTORY ${CUDF_SOURCE_DIR}/include/cudf ${CUDF_SOURCE_DIR}/include/cudf_test + ${CUDF_SOURCE_DIR}/include/nvtext DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) + if(CUDF_BUILD_STREAMS_TEST_UTIL) install(TARGETS cudf_identify_stream_usage_mode_cudf DESTINATION 
${lib_dir}) install(TARGETS cudf_identify_stream_usage_mode_testing DESTINATION ${lib_dir}) @@ -976,12 +965,6 @@ string( [=[ if(testing IN_LIST cudf_FIND_COMPONENTS) enable_language(CUDA) - if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake") - include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake") - endif() - if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") - include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") - endif() endif() ]=] ) @@ -989,8 +972,8 @@ string(APPEND install_code_string "${common_code_string}") rapids_export( INSTALL cudf - EXPORT_SET cudf-exports - GLOBAL_TARGETS cudf + EXPORT_SET cudf-exports ${_components_export_string} + GLOBAL_TARGETS cudf cudftestutil NAMESPACE cudf:: DOCUMENTATION doc_string FINAL_CODE_BLOCK install_code_string @@ -1013,23 +996,13 @@ string(APPEND build_code_string "${common_code_string}") rapids_export( BUILD cudf - EXPORT_SET cudf-exports - GLOBAL_TARGETS cudf + EXPORT_SET cudf-exports ${_components_export_string} + GLOBAL_TARGETS cudf cudftestutil NAMESPACE cudf:: DOCUMENTATION doc_string FINAL_CODE_BLOCK build_code_string ) -if(CUDF_BUILD_TESTUTIL) - export( - EXPORT cudf-testing-exports - FILE ${CUDF_BINARY_DIR}/cudf-testing-targets.cmake - NAMESPACE cudf:: - ) - rapids_export_write_dependencies( - BUILD cudf-testing-exports "${CUDF_BINARY_DIR}/cudf-testing-dependencies.cmake" - ) -endif() # ################################################################################################## # * make documentation ---------------------------------------------------------------------------- diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 545028260b8..762e9640d12 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -430,8 +430,12 @@ std::unique_ptr create_random_column(data_profile const& profile, null_mask.begin()); } - auto [result_bitmask, null_count] = 
cudf::detail::valid_if( - null_mask.begin(), null_mask.end(), thrust::identity{}, cudf::get_default_stream()); + auto [result_bitmask, null_count] = + cudf::detail::valid_if(null_mask.begin(), + null_mask.end(), + thrust::identity{}, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); return std::make_unique( dtype, @@ -509,8 +513,12 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons thrust::make_zip_iterator(offsets.begin(), offsets.begin() + 1), num_rows, string_generator{chars.data(), engine}); - auto [result_bitmask, null_count] = cudf::detail::valid_if( - null_mask.begin(), null_mask.end() - 1, thrust::identity{}, cudf::get_default_stream()); + auto [result_bitmask, null_count] = + cudf::detail::valid_if(null_mask.begin(), + null_mask.end() - 1, + thrust::identity{}, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); return cudf::make_strings_column( num_rows, std::move(offsets), @@ -628,8 +636,11 @@ std::unique_ptr create_random_column(data_profi auto [null_mask, null_count] = [&]() { if (profile.get_null_probability().has_value()) { auto valids = valid_dist(engine, num_rows); - return cudf::detail::valid_if( - valids.begin(), valids.end(), thrust::identity{}, cudf::get_default_stream()); + return cudf::detail::valid_if(valids.begin(), + valids.end(), + thrust::identity{}, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); } return std::pair{}; }(); @@ -712,9 +723,12 @@ std::unique_ptr create_random_column(data_profile auto offsets_column = std::make_unique( cudf::data_type{cudf::type_id::INT32}, num_rows + 1, offsets.release()); - auto [null_mask, null_count] = cudf::detail::valid_if( - valids.begin(), valids.end(), thrust::identity{}, cudf::get_default_stream()); - list_column = cudf::make_lists_column( + auto [null_mask, null_count] = cudf::detail::valid_if(valids.begin(), + valids.end(), + thrust::identity{}, + cudf::get_default_stream(), + 
rmm::mr::get_current_device_resource()); + list_column = cudf::make_lists_column( num_rows, std::move(offsets_column), std::move(current_child_column), @@ -840,7 +854,8 @@ std::pair create_random_null_mask( return cudf::detail::valid_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(size), bool_generator{seed, 1.0 - *null_probability}, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); } } diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index 73060200d00..1b1cf9b7e9d 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -140,8 +140,8 @@ void BM_iterator(benchmark::State& state) cudf::column_view hasnull_F = wrap_hasnull_F; // Initialize dev_result to false - auto dev_result = - cudf::detail::make_zeroed_device_uvector_sync(1, cudf::get_default_stream()); + auto dev_result = cudf::detail::make_zeroed_device_uvector_sync( + 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { @@ -210,7 +210,7 @@ void BM_pair_iterator(benchmark::State& state) // Initialize dev_result to false auto dev_result = cudf::detail::make_zeroed_device_uvector_sync>( - 1, cudf::get_default_stream()); + 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index e37a4ca1193..70036a95377 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ 
b/cpp/benchmarks/join/join_common.hpp @@ -104,8 +104,11 @@ void BM_join(state_type& state, Join JoinFunc) // roughly 75% nulls auto validity = thrust::make_transform_iterator(thrust::make_counting_iterator(0), null75_generator{}); - return cudf::detail::valid_if( - validity, validity + size, thrust::identity{}, cudf::get_default_stream()) + return cudf::detail::valid_if(validity, + validity + size, + thrust::identity{}, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()) .first; }; diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index a88f621095c..91c3dccfdc6 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -121,8 +121,8 @@ recommend watching Sean Parent's [C++ Seasoning talk](https://www.youtube.com/wa and we try to follow his rules: "No raw loops. No raw pointers. No raw synchronization primitives." * Prefer algorithms from STL and Thrust to raw loops. - * Prefer libcudf and RMM [owning data structures and views](#libcudf-data-structures) to raw pointers - and raw memory allocation. + * Prefer libcudf and RMM [owning data structures and views](#libcudf-data-structures) to raw + pointers and raw memory allocation. * libcudf doesn't have a lot of CPU-thread concurrency, but there is some. And currently libcudf does use raw synchronization primitives. So we should revisit Parent's third rule and improve here. @@ -146,8 +146,8 @@ The following guidelines apply to organizing `#include` lines. * Separate groups by a blank line. * Order the groups from "nearest" to "farthest". In other words, local includes, then includes from other RAPIDS libraries, then includes from related libraries, like ``, then - includes from dependencies installed with cuDF, and then standard headers (for example ``, - ``). + includes from dependencies installed with cuDF, and then standard headers (for example + ``, ``). 
* Use `<>` instead of `""` unless the header is in the same directory as the source file. * Tools like `clangd` often auto-insert includes when they can, but they usually get the grouping and brackets wrong. @@ -271,10 +271,12 @@ A *mutable*, non-owning view of a table. ## cudf::size_type -The `cudf::size_type` is the type used for the number of elements in a column, offsets to elements within a column, indices to address specific elements, segments for subsets of column elements, etc. +The `cudf::size_type` is the type used for the number of elements in a column, offsets to elements +within a column, indices to address specific elements, segments for subsets of column elements, etc. It is equivalent to a signed, 32-bit integer type and therefore has a maximum value of 2147483647. -Some APIs also accept negative index values and those functions support a minimum value of -2147483648. -This fundamental type also influences output values not just for column size limits but for counting elements as well. +Some APIs also accept negative index values and those functions support a minimum value of +-2147483648. This fundamental type also influences output values not just for column size limits +but for counting elements as well. ## Spans @@ -343,8 +345,8 @@ auto s1 = static_cast(s.get()); ``` ### Passing to device -Each scalar type, except `list_scalar`, has a corresponding non-owning device view class which allows -access to the value and its validity from the device. This can be obtained using the function +Each scalar type, except `list_scalar`, has a corresponding non-owning device view class which +allows access to the value and its validity from the device. This can be obtained using the function `get_scalar_device_view(ScalarType s)`. Note that a device view is not provided for a base scalar object, only for the derived typed scalar class objects. 
@@ -355,68 +357,84 @@ data, a specialized device view for list columns can be constructed via # libcudf Policies and Design Principles -`libcudf` is designed to provide thread-safe, single-GPU accelerated algorithm primitives for solving a wide variety of problems that arise in data science. -APIs are written to execute on the default GPU, which can be controlled by the caller through standard CUDA device APIs or environment variables like `CUDA_VISIBLE_DEVICES`. -Our goal is to enable diverse use cases like Spark or Pandas to benefit from the performance of GPUs, and libcudf relies on these higher-level layers like Spark or Dask to orchestrate multi-GPU tasks. +`libcudf` is designed to provide thread-safe, single-GPU accelerated algorithm primitives for +solving a wide variety of problems that arise in data science. APIs are written to execute on the +default GPU, which can be controlled by the caller through standard CUDA device APIs or environment +variables like `CUDA_VISIBLE_DEVICES`. Our goal is to enable diverse use cases like Spark or Pandas +to benefit from the performance of GPUs, and libcudf relies on these higher-level layers like Spark +or Dask to orchestrate multi-GPU tasks. -To best satisfy these use-cases, libcudf prioritizes performance and flexibility, which sometimes may come at the cost of convenience. -While we welcome users to use libcudf directly, we design with the expectation that most users will be consuming libcudf through higher-level layers like Spark or cuDF Python that handle some of details that direct users of libcudf must handle on their own. -We document these policies and the reasons behind them here. +To best satisfy these use-cases, libcudf prioritizes performance and flexibility, which sometimes +may come at the cost of convenience. 
While we welcome users to use libcudf directly, we design with +the expectation that most users will be consuming libcudf through higher-level layers like Spark or +cuDF Python that handle some of details that direct users of libcudf must handle on their own. We +document these policies and the reasons behind them here. ## libcudf does not introspect data libcudf APIs generally do not perform deep introspection and validation of input data. There are numerous reasons for this: 1. It violates the single responsibility principle: validation is separate from execution. -2. Since libcudf data structures store data on the GPU, any validation incurs _at minimum_ the overhead of a kernel launch, and may in general be prohibitively expensive. +2. Since libcudf data structures store data on the GPU, any validation incurs _at minimum_ the + overhead of a kernel launch, and may in general be prohibitively expensive. 3. API promises around data introspection often significantly complicate implementation. Users are therefore responsible for passing valid data into such APIs. _Note that this policy does not mean that libcudf performs no validation whatsoever_. libcudf APIs should still perform any validation that does not require introspection. -To give some idea of what should or should not be validated, here are (non-exhaustive) lists of examples. +To give some idea of what should or should not be validated, here are (non-exhaustive) lists of +examples. 
**Things that libcudf should validate**: - Input column/table sizes or data types **Things that libcudf should not validate**: - Integer overflow -- Ensuring that outputs will not exceed the [2GB size](#cudfsize_type) limit for a given set of inputs +- Ensuring that outputs will not exceed the [2GB size](#cudfsize_type) limit for a given set of + inputs ## libcudf expects nested types to have sanitized null masks -Various libcudf APIs accepting columns of nested data types (such as `LIST` or `STRUCT`) may assume that these columns have been sanitized. -In this context, sanitization refers to ensuring that the null elements in a column with a nested dtype are compatible with the elements of nested columns. +Various libcudf APIs accepting columns of nested data types (such as `LIST` or `STRUCT`) may assume +that these columns have been sanitized. In this context, sanitization refers to ensuring that the +null elements in a column with a nested dtype are compatible with the elements of nested columns. Specifically: -- Null elements of list columns should also be empty. The starting offset of a null element should be equal to the ending offset. +- Null elements of list columns should also be empty. The starting offset of a null element should + be equal to the ending offset. - Null elements of struct columns should also be null elements in the underlying structs. -- For compound columns, nulls should only be present at the level of the parent column. Child columns should not contain nulls. +- For compound columns, nulls should only be present at the level of the parent column. Child + columns should not contain nulls. - Slice operations on nested columns do not propagate offsets to child columns. -libcudf APIs _should_ promise to never return "dirty" columns, i.e. columns containing unsanitized data. -Therefore, the only problem is if users construct input columns that are not correctly sanitized and then pass those into libcudf APIs. 
+libcudf APIs _should_ promise to never return "dirty" columns, i.e. columns containing unsanitized +data. Therefore, the only problem is if users construct input columns that are not correctly +sanitized and then pass those into libcudf APIs. ## Treat libcudf APIs as if they were asynchronous libcudf APIs called on the host do not guarantee that the stream is synchronized before returning. -Work in libcudf occurs on `cudf::get_default_stream().value`, which defaults to the CUDA default stream (stream 0). -Note that the stream 0 behavior differs if [per-thread default stream is enabled](https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html) via `CUDF_USE_PER_THREAD_DEFAULT_STREAM`. -Any data provided to or returned by libcudf that uses a separate non-blocking stream requires synchronization with the default libcudf stream to ensure stream safety. +Work in libcudf occurs on `cudf::get_default_stream().value`, which defaults to the CUDA default +stream (stream 0). Note that the stream 0 behavior differs if [per-thread default stream is +enabled](https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html) via +`CUDF_USE_PER_THREAD_DEFAULT_STREAM`. Any data provided to or returned by libcudf that uses a +separate non-blocking stream requires synchronization with the default libcudf stream to ensure +stream safety. ## libcudf generally does not make ordering guarantees -Functions like merge or groupby in libcudf make no guarantees about the order of entries in the output. -Promising deterministic ordering is not, in general, conducive to fast parallel algorithms. +Functions like merge or groupby in libcudf make no guarantees about the order of entries in the +output. Promising deterministic ordering is not, in general, conducive to fast parallel algorithms. Calling code is responsible for performing sorts after the fact if sorted outputs are needed. 
## libcudf does not promise specific exception messages -libcudf documents the exceptions that will be thrown by an API for different kinds of invalid inputs. -The types of those exceptions (e.g. `cudf::logic_error`) are part of the public API. -However, the explanatory string returned by the `what` method of those exceptions is not part of the API and is subject to change. -Calling code should not rely on the contents of libcudf error messages to determine the nature of the error. -For information on the types of exceptions that libcudf throws under different circumstances, see the [section on error handling](#errors). +libcudf documents the exceptions that will be thrown by an API for different kinds of invalid +inputs. The types of those exceptions (e.g. `cudf::logic_error`) are part of the public API. +However, the explanatory string returned by the `what` method of those exceptions is not part of the +API and is subject to change. Calling code should not rely on the contents of libcudf error +messages to determine the nature of the error. For information on the types of exceptions that +libcudf throws under different circumstances, see the [section on error handling](#errors). # libcudf API and Implementation @@ -475,14 +493,6 @@ asynchrony if and when we add an asynchronous API to libcudf. **Note:** `cudaDeviceSynchronize()` should *never* be used. This limits the ability to do any multi-stream/multi-threaded work with libcudf APIs. - ### NVTX Ranges - -In order to aid in performance optimization and debugging, all compute intensive libcudf functions -should have a corresponding NVTX range. In libcudf, we have a convenience macro `CUDF_FUNC_RANGE()` -that will automatically annotate the lifetime of the enclosing function and use the function's name -as the name of the NVTX range. For more information about NVTX, see -[here](https://github.com/NVIDIA/NVTX/tree/dev/c). 
- ### Stream Creation There may be times in implementing libcudf features where it would be advantageous to use streams @@ -494,8 +504,8 @@ should avoid creating streams (even if it is slightly less efficient). It is a g ## Memory Allocation -Device [memory resources](#rmmdevice_memory_resource) are used in libcudf to abstract and control how device -memory is allocated. +Device [memory resources](#rmmdevice_memory_resource) are used in libcudf to abstract and control +how device memory is allocated. ### Output Memory @@ -515,6 +525,12 @@ std::unique_ptr returns_output_memory( void does_not_allocate_output_memory(...); ``` +This rule automatically applies to all detail APIs that allocates memory. Any detail API may be +called by any public API, and therefore could be allocating memory that is returned to the user. +To support such uses cases, all detail APIs allocating memory resources should accept an `mr` +parameter. Callers are responsible for either passing through a provided `mr` or +`rmm::mr::get_current_device_resource()` as needed. + ### Temporary Memory Not all memory allocated within a libcudf API is returned to the caller. Often algorithms must @@ -535,7 +551,7 @@ rmm::device_buffer some_function( ### Memory Management libcudf code generally eschews raw pointers and direct memory allocation. Use RMM classes built to -use `device_memory_resource`(*)s for device memory allocation with automated lifetime management. +use `device_memory_resource`s for device memory allocation with automated lifetime management. #### rmm::device_buffer Allocates a specified number of bytes of untyped, uninitialized device memory using a @@ -617,6 +633,32 @@ rmm::mr::device_memory_resource * mr = new my_custom_resource{...}; rmm::device_uvector v2{100, s, mr}; ``` +## Default Parameters + +While public libcudf APIs are free to include default function parameters, detail functions should +not. 
Default memory resource parameters make it easy for developers to accidentally allocate memory +using the incorrect resource. Avoiding default memory resources forces developers to consider each +memory allocation carefully. + +While streams are not currently exposed in libcudf's API, we plan to do so eventually. As a result, +the same reasons for memory resources also apply to streams. Public APIs default to using +`cudf::get_default_stream()`. However, including the same default in detail APIs opens the door for +developers to forget to pass in a user-provided stream if one is passed to a public API. Forcing +every detail API call to explicitly pass a stream is intended to prevent such mistakes. + +The memory resources (and eventually, the stream) are the final parameters for essentially all +public APIs. For API consistency, the same is true throughout libcudf's internals. Therefore, a +consequence of not allowing default streams or MRs is that no parameters in detail APIs may have +defaults. + +## NVTX Ranges + +In order to aid in performance optimization and debugging, all compute intensive libcudf functions +should have a corresponding NVTX range. libcudf has a convenience macro `CUDF_FUNC_RANGE()` that +automatically annotates the lifetime of the enclosing function and uses the function's name as +the name of the NVTX range. For more information about NVTX, see +[here](https://github.com/NVIDIA/NVTX/tree/dev/c). + ## Input/Output Style The preferred style for how inputs are passed in and outputs are returned is the following: @@ -886,9 +928,9 @@ CUDF_FAIL("This code path should not be reached."); ### CUDA Error Checking -Use the `CUDF_CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This -macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. The -thrown exception includes a description of the CUDA error code in its `what()` message. 
+Use the `CUDF_CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. +This macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. +The thrown exception includes a description of the CUDA error code in its `what()` message. Example: @@ -1111,8 +1153,8 @@ For list columns, the parent column's type is `LIST` and contains no data, but i the number of lists in the column, and its null mask represents the validity of each list element. The parent has two children. -1. A non-nullable column of [`size_type`](#cudfsize_type) elements that indicates the offset to the beginning of each list - in a dense column of elements. +1. A non-nullable column of [`size_type`](#cudfsize_type) elements that indicates the offset to the + beginning of each list in a dense column of elements. 2. A column containing the actual data and optional null mask for all elements of all the lists packed together. @@ -1271,9 +1313,9 @@ libcudf provides view types for nested column types as well as for the data elem `cudf::strings_column_view` is a view of a strings column, like `cudf::column_view` is a view of any `cudf::column`. `cudf::string_view` is a view of a single string, and therefore `cudf::string_view` is the data type of a `cudf::column` of type `STRING` just like `int32_t` is the -data type for a `cudf::column` of type [`size_type`](#cudfsize_type). As its name implies, this is a read-only object -instance that points to device memory inside the strings column. It's lifespan is the same (or less) -as the column it views. +data type for a `cudf::column` of type [`size_type`](#cudfsize_type). As its name implies, this is a +read-only object instance that points to device memory inside the strings column. It's lifespan is +the same (or less) as the column it views. Use the `column_device_view::element` method to access an individual row element. Like any other column, do not call `element()` on a row that is null. 
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index ac2865c05c5..5460a0e5a76 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -583,10 +583,12 @@ void gather_bitmask(table_view const& source, std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) { return col->mutable_view().null_mask(); }); - auto d_target_masks = make_device_uvector_async(target_masks, stream); + auto d_target_masks = + make_device_uvector_async(target_masks, stream, rmm::mr::get_current_device_resource()); auto const device_source = table_device_view::create(source, stream); - auto d_valid_counts = make_zeroed_device_uvector_async(target.size(), stream); + auto d_valid_counts = make_zeroed_device_uvector_async( + target.size(), stream, rmm::mr::get_current_device_resource()); // Dispatch operation enum to get implementation auto const impl = [op]() { diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index cb9ced6fc28..ce2619d767e 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -426,7 +426,8 @@ std::vector segmented_count_bits(bitmask_type const* bitmask, // Construct a contiguous host buffer of indices and copy to device. auto const h_indices = std::vector(indices_begin, indices_end); - auto const d_indices = make_device_uvector_async(h_indices, stream); + auto const d_indices = + make_device_uvector_async(h_indices, stream, rmm::mr::get_current_device_resource()); // Compute the bit counts over each segment. auto first_bit_indices_begin = thrust::make_transform_iterator( diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp index ccffcbc61df..5ab53690a23 100644 --- a/cpp/include/cudf/detail/reshape.hpp +++ b/cpp/include/cudf/detail/reshape.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,21 +30,19 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches */ -std::unique_ptr tile( - table_view const& input, - size_type count, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
tile(table_view const& input, + size_type count, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::interleave_columns * * @param stream CUDA stream used for device memory operations and kernel launches */ -std::unique_ptr interleave_columns( - table_view const& input, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr interleave_columns(table_view const& input, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/round.hpp b/cpp/include/cudf/detail/round.hpp index 1e5612919f4..cdfc7caef37 100644 --- a/cpp/include/cudf/detail/round.hpp +++ b/cpp/include/cudf/detail/round.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,12 +31,11 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr round( - column_view const& input, - int32_t decimal_places, - rounding_method method, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr round(column_view const& input, + int32_t decimal_places, + rounding_method method, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index c8b17e22df2..dbf7bfa9527 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -386,13 +386,12 @@ struct column_scatterer_impl { * @return Result of scattering values from source to target */ template -std::unique_ptr
scatter( - table_view const& source, - MapIterator scatter_map_begin, - MapIterator scatter_map_end, - table_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr
scatter(table_view const& source, + MapIterator scatter_map_begin, + MapIterator scatter_map_end, + table_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index 7c4b04537ea..39ae4fe1944 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,12 +59,11 @@ namespace detail { * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ -std::unique_ptr
scatter( - table_view const& source, - column_view const& scatter_map, - table_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
scatter(table_view const& source, + column_view const& scatter_map, + table_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::detail::scatter(table_view const&,column_view const&,table_view @@ -72,12 +71,11 @@ std::unique_ptr
scatter( * * @throws cudf::logic_error if `scatter_map` span size is larger than max of `size_type`. */ -std::unique_ptr
scatter( - table_view const& source, - device_span const scatter_map, - table_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
scatter(table_view const& source, + device_span const scatter_map, + table_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Scatters a row of scalar values into a copy of the target table @@ -108,12 +106,11 @@ std::unique_ptr
scatter( * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ -std::unique_ptr
scatter( - std::vector> const& source, - column_view const& indices, - table_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
scatter(std::vector> const& source, + column_view const& indices, + table_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::boolean_mask_scatter( @@ -123,12 +120,11 @@ std::unique_ptr
scatter( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr
boolean_mask_scatter( - table_view const& source, - table_view const& target, - column_view const& boolean_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr
boolean_mask_scatter(table_view const& source, + table_view const& target, + column_view const& boolean_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::boolean_mask_scatter( @@ -144,7 +140,7 @@ std::unique_ptr
boolean_mask_scatter( table_view const& target, column_view const& boolean_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp index 56d41fd635c..4c4ad7834f4 100644 --- a/cpp/include/cudf/detail/search.hpp +++ b/cpp/include/cudf/detail/search.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,12 +89,11 @@ std::unique_ptr contains(column_view const& haystack, * @param mr Device memory resource used to allocate the returned vector * @return A vector of bools indicating if each row in `needles` has matching rows in `haystack` */ -rmm::device_uvector contains( - table_view const& haystack, - table_view const& needles, - null_equality compare_nulls, - nan_equality compare_nans, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +rmm::device_uvector contains(table_view const& haystack, + table_view const& needles, + null_equality compare_nulls, + nan_equality compare_nans, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/sequence.hpp b/cpp/include/cudf/detail/sequence.hpp index 4a9bf5c74e1..3c3d1d0ed9e 100644 --- a/cpp/include/cudf/detail/sequence.hpp +++ b/cpp/include/cudf/detail/sequence.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,12 +32,11 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr sequence( - size_type size, - scalar const& init, - scalar const& step, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr sequence(size_type size, + scalar const& init, + scalar const& step, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::sequence(size_type size, scalar const& init, @@ -46,11 +45,10 @@ std::unique_ptr sequence( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr sequence( - size_type size, - scalar const& init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr sequence(size_type size, + scalar const& init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::calendrical_month_sequence(size_type size, @@ -60,12 +58,11 @@ std::unique_ptr sequence( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr calendrical_month_sequence( - size_type size, - scalar const& init, - size_type months, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr calendrical_month_sequence(size_type size, + scalar const& init, + size_type months, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/timezone.cuh b/cpp/include/cudf/detail/timezone.cuh new file mode 100644 index 00000000000..830ee1a7fa6 --- /dev/null +++ b/cpp/include/cudf/detail/timezone.cuh @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::detail { + +/** + * @brief Returns the UT offset for a given date and given timezone table. + * + * @param transition_times Transition times; trailing `solar_cycle_entry_count` entries are used for + * all times beyond the one covered by the TZif file + * @param offsets Time offsets in specific intervals; trailing `solar_cycle_entry_count` entries are + * used for all times beyond the one covered by the TZif file + * @param ts ORC timestamp + * + * @return offset from UT, in seconds + */ +inline __device__ duration_s get_ut_offset(table_device_view tz_table, timestamp_s ts) +{ + if (tz_table.num_rows() == 0) { return duration_s{0}; } + + cudf::device_span transition_times(tz_table.column(0).head(), + static_cast(tz_table.num_rows())); + + auto const ts_ttime_it = [&]() { + auto last_less_equal = [](auto begin, auto end, auto value) { + auto const first_larger = thrust::upper_bound(thrust::seq, begin, end, value); + // Return start of the range if all elements are larger than the value + if (first_larger == begin) return begin; + // Element before the first larger element is the last one less or equal + return first_larger - 1; + }; + + auto const file_entry_end = + transition_times.begin() + (transition_times.size() - solar_cycle_entry_count); + + if (ts <= *(file_entry_end - 1)) { + // Search the file entries 
if the timestamp is in range + return last_less_equal(transition_times.begin(), file_entry_end, ts); + } else { + auto project_to_cycle = [](timestamp_s ts) { + // Years divisible by four are leap years + // Exceptions are years divisible by 100, but not divisible by 400 + static constexpr int32_t num_leap_years_in_cycle = + solar_cycle_years / 4 - (solar_cycle_years / 100 - solar_cycle_years / 400); + static constexpr duration_s cycle_s = cuda::std::chrono::duration_cast( + duration_D{365 * solar_cycle_years + num_leap_years_in_cycle}); + return timestamp_s{(ts.time_since_epoch() + cycle_s) % cycle_s}; + }; + // Search the 400-year cycle if outside of the file entries range + return last_less_equal(file_entry_end, transition_times.end(), project_to_cycle(ts)); + } + }(); + + return tz_table.column(1).element(ts_ttime_it - transition_times.begin()); +} + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp new file mode 100644 index 00000000000..f7f97c0a7c2 --- /dev/null +++ b/cpp/include/cudf/detail/timezone.hpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include + +namespace cudf::detail { + +/** + * @copydoc cudf::make_timezone_transition_table(std::optional, std::string_view, + * rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr
make_timezone_transition_table( + std::optional tzif_dir, + std::string_view timezone_name, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 8e19ebb8da7..5b64f61f11a 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,24 +29,22 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr transform( - column_view const& input, - std::string const& unary_udf, - data_type output_type, - bool is_ptx, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr transform(column_view const& input, + std::string const& unary_udf, + data_type output_type, + bool is_ptx, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::compute_column * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr compute_column( - table_view const table, - ast::operation const& expr, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr compute_column(table_view const table, + ast::operation const& expr, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::nans_to_nulls @@ -54,9 +52,7 @@ std::unique_ptr compute_column( * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::pair, size_type> nans_to_nulls( - column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::bools_to_mask @@ -64,9 +60,7 @@ std::pair, size_type> nans_to_nulls( * @param stream CUDA stream used for device memory operations and kernel launches. */ std::pair, cudf::size_type> bools_to_mask( - column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::encode @@ -74,42 +68,37 @@ std::pair, cudf::size_type> bools_to_mask( * @param stream CUDA stream used for device memory operations and kernel launches. */ std::pair, std::unique_ptr> encode( - cudf::table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::one_hot_encode * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::pair, table_view> one_hot_encode( - column_view const& input, - column_view const& categories, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair, table_view> one_hot_encode(column_view const& input, + column_view const& categories, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::mask_to_bools * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr mask_to_bools( - bitmask_type const* null_mask, - size_type begin_bit, - size_type end_bit, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr mask_to_bools(bitmask_type const* null_mask, + size_type begin_bit, + size_type end_bit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::row_bit_count * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr row_bit_count( - table_view const& t, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr row_bit_count(table_view const& t, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/transpose.hpp b/cpp/include/cudf/detail/transpose.hpp index 0470d625edc..d0be51860b2 100644 --- a/cpp/include/cudf/detail/transpose.hpp +++ b/cpp/include/cudf/detail/transpose.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,10 +28,9 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::pair, table_view> transpose( - table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::pair, table_view> transpose(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index b7ecedc1489..3fbdf4a5a8f 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -45,13 +45,12 @@ namespace detail { */ template -std::unique_ptr true_if( - InputIterator begin, - InputIterator end, - size_type size, - Predicate p, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr true_if(InputIterator begin, + InputIterator end, + size_type size, + Predicate p, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto output = make_numeric_column(data_type(type_id::BOOL8), size, mask_state::UNALLOCATED, stream, mr); @@ -68,52 +67,47 @@ std::unique_ptr true_if( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr unary_operation( - cudf::column_view const& input, - cudf::unary_operator op, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr unary_operation(cudf::column_view const& input, + cudf::unary_operator op, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::is_valid * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr is_valid( - cudf::column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr is_valid(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::cast * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr cast( - column_view const& input, - data_type type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr cast(column_view const& input, + data_type type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::is_nan * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr is_nan( - cudf::column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr is_nan(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::is_not_nan * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr is_not_nan( - cudf::column_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr is_not_nan(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 75df0d92d0a..c446a7b5148 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -48,10 +48,9 @@ namespace detail { * @return A device_uvector containing zeros */ template -rmm::device_uvector make_zeroed_device_uvector_async( - std::size_t size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +rmm::device_uvector make_zeroed_device_uvector_async(std::size_t size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_uvector ret(size, stream, mr); CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value())); @@ -70,10 +69,9 @@ rmm::device_uvector make_zeroed_device_uvector_async( * @return A device_uvector containing zeros */ template -rmm::device_uvector make_zeroed_device_uvector_sync( - std::size_t size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +rmm::device_uvector make_zeroed_device_uvector_sync(std::size_t size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_uvector ret(size, stream, mr); CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value())); @@ -94,10 +92,9 @@ rmm::device_uvector make_zeroed_device_uvector_sync( * @return A device_uvector containing the copied data */ template -rmm::device_uvector make_device_uvector_async( - host_span source_data, - 
rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +rmm::device_uvector make_device_uvector_async(host_span source_data, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_uvector ret(source_data.size(), stream, mr); CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(), @@ -126,9 +123,7 @@ template < std::enable_if_t< std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_async( - Container const& c, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return make_device_uvector_async(host_span{c}, stream, mr); } @@ -146,10 +141,9 @@ rmm::device_uvector make_device_uvector_async( * @return A device_uvector containing the copied data */ template -rmm::device_uvector make_device_uvector_async( - device_span source_data, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +rmm::device_uvector make_device_uvector_async(device_span source_data, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { rmm::device_uvector ret(source_data.size(), stream, mr); CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(), @@ -178,9 +172,7 @@ template < std::enable_if_t< std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_async( - Container const& c, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return make_device_uvector_async( device_span{c}, stream, mr); @@ -199,10 +191,9 @@ rmm::device_uvector make_device_uvector_async( * @return A device_uvector containing the copied data */ template -rmm::device_uvector make_device_uvector_sync( - host_span source_data, - rmm::cuda_stream_view stream, - 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +rmm::device_uvector make_device_uvector_sync(host_span source_data, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto ret = make_device_uvector_async(source_data, stream, mr); stream.synchronize(); @@ -227,9 +218,7 @@ template < std::enable_if_t< std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_sync( - Container const& c, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return make_device_uvector_sync(host_span{c}, stream, mr); } @@ -247,10 +236,9 @@ rmm::device_uvector make_device_uvector_sync( * @return A device_uvector containing the copied data */ template -rmm::device_uvector make_device_uvector_sync( - device_span source_data, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +rmm::device_uvector make_device_uvector_sync(device_span source_data, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto ret = make_device_uvector_async(source_data, stream, mr); stream.synchronize(); @@ -275,9 +263,7 @@ template < std::enable_if_t< std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_sync( - Container const& c, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return make_device_uvector_sync(device_span{c}, stream, mr); } diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index 04c78bed17d..76d6fd719a4 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,12 +86,11 @@ __global__ void valid_if_kernel( * null count */ template -std::pair valid_if( - InputIterator begin, - InputIterator end, - Predicate p, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::pair valid_if(InputIterator begin, + InputIterator end, + Predicate p, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(begin <= end, "Invalid range."); diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 90d730338fc..9fdc7a47fb9 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -56,7 +56,7 @@ void write_csv(data_sink* sink, host_span column_names, csv_writer_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr); } // namespace csv } // namespace detail diff --git a/cpp/include/cudf/io/detail/tokenize_json.hpp b/cpp/include/cudf/io/detail/tokenize_json.hpp index b03dbd4fb70..4914f434c98 100644 --- a/cpp/include/cudf/io/detail/tokenize_json.hpp +++ b/cpp/include/cudf/io/detail/tokenize_json.hpp @@ -131,7 +131,7 @@ std::pair, rmm::device_uvector> ge device_span json_in, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr); } // namespace detail diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp index bf833d4720c..6ae399fbe75 100644 --- a/cpp/include/cudf/io/text/detail/tile_state.hpp +++ b/cpp/include/cudf/io/text/detail/tile_state.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -82,7 +82,7 @@ struct scan_tile_state { scan_tile_state(cudf::size_type num_tiles, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr) : tile_status(rmm::device_uvector>( num_tiles, stream, mr)), tile_state_partial(rmm::device_uvector(num_tiles, stream, mr)), diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index a908a9fa227..7bb2e4e2ece 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -165,7 +165,7 @@ struct trie { */ static trie create(std::string const& pattern, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr) { return create(std::vector{pattern}, stream, mr); @@ -181,7 +181,7 @@ struct trie { */ static trie create(std::vector const& patterns, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + rmm::mr::device_memory_resource* mr) { std::vector tokens; std::vector transitions; diff --git a/cpp/include/cudf/lists/detail/combine.hpp b/cpp/include/cudf/lists/detail/combine.hpp index 9f28074173a..4bc45e48a9f 100644 --- a/cpp/include/cudf/lists/detail/combine.hpp +++ b/cpp/include/cudf/lists/detail/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,22 +27,20 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr concatenate_rows( - table_view const& input, - concatenate_null_policy null_policy, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate_rows(table_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::lists::concatenate_list_elements * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr concatenate_list_elements( - column_view const& input, - concatenate_null_policy null_policy, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate_list_elements(column_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/lists/detail/contains.hpp b/cpp/include/cudf/lists/detail/contains.hpp index 24318e72e98..58ec18cb9ef 100644 --- a/cpp/include/cudf/lists/detail/contains.hpp +++ b/cpp/include/cudf/lists/detail/contains.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,12 +29,11 @@ namespace detail { * rmm::mr::device_memory_resource*) * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr index_of( - cudf::lists_column_view const& lists, - cudf::scalar const& search_key, - cudf::lists::duplicate_find_option find_option, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr index_of(cudf::lists_column_view const& lists, + cudf::scalar const& search_key, + cudf::lists::duplicate_find_option find_option, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::lists::index_of(cudf::lists_column_view const&, @@ -43,12 +42,11 @@ std::unique_ptr index_of( * rmm::mr::device_memory_resource*) * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr index_of( - cudf::lists_column_view const& lists, - cudf::column_view const& search_keys, - cudf::lists::duplicate_find_option find_option, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr index_of(cudf::lists_column_view const& lists, + cudf::column_view const& search_keys, + cudf::lists::duplicate_find_option find_option, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::lists::contains(cudf::lists_column_view const&, @@ -56,11 +54,10 @@ std::unique_ptr index_of( * rmm::mr::device_memory_resource*) * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr contains( - cudf::lists_column_view const& lists, - cudf::scalar const& search_key, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr contains(cudf::lists_column_view const& lists, + cudf::scalar const& search_key, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::lists::contains(cudf::lists_column_view const&, @@ -68,11 +65,10 @@ std::unique_ptr contains( * rmm::mr::device_memory_resource*) * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr contains( - cudf::lists_column_view const& lists, - cudf::column_view const& search_keys, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr contains(cudf::lists_column_view const& lists, + cudf::column_view const& search_keys, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/lists/detail/set_operations.hpp b/cpp/include/cudf/lists/detail/set_operations.hpp index ef4255de430..1411c65448e 100644 --- a/cpp/include/cudf/lists/detail/set_operations.hpp +++ b/cpp/include/cudf/lists/detail/set_operations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,52 +30,48 @@ namespace cudf::lists::detail { * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr have_overlap( - lists_column_view const& lhs, - lists_column_view const& rhs, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr have_overlap(lists_column_view const& lhs, + lists_column_view const& rhs, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::list::intersect_distinct * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr intersect_distinct( - lists_column_view const& lhs, - lists_column_view const& rhs, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr intersect_distinct(lists_column_view const& lhs, + lists_column_view const& rhs, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::list::union_distinct * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr union_distinct( - lists_column_view const& lhs, - lists_column_view const& rhs, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr union_distinct(lists_column_view const& lhs, + lists_column_view const& rhs, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::list::difference_distinct * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr difference_distinct( - lists_column_view const& lhs, - lists_column_view const& rhs, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr difference_distinct(lists_column_view const& lhs, + lists_column_view const& rhs, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** @} */ // end of group } // namespace cudf::lists::detail diff --git a/cpp/include/cudf/lists/lists_column_factories.hpp b/cpp/include/cudf/lists/lists_column_factories.hpp index a6eacb97e91..fea1118748c 100644 --- a/cpp/include/cudf/lists/lists_column_factories.hpp +++ b/cpp/include/cudf/lists/lists_column_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,11 +35,10 @@ namespace detail { * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @param[in] mr Device memory resource used to allocate the returned column's device memory. 
*/ -std::unique_ptr make_lists_column_from_scalar( - list_scalar const& value, - size_type size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr make_lists_column_from_scalar(list_scalar const& value, + size_type size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace lists diff --git a/cpp/include/cudf/structs/detail/concatenate.hpp b/cpp/include/cudf/structs/detail/concatenate.hpp index a098703e4b0..82ccca188e2 100644 --- a/cpp/include/cudf/structs/detail/concatenate.hpp +++ b/cpp/include/cudf/structs/detail/concatenate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,10 +48,9 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ -std::unique_ptr concatenate( - host_span columns, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate(host_span columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace structs diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp new file mode 100644 index 00000000000..56678c73811 --- /dev/null +++ b/cpp/include/cudf/timezone.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include +#include + +namespace cudf { +class table; + +// Cycle in which the time offsets repeat in Gregorian calendar +static constexpr int32_t solar_cycle_years = 400; +// Number of future entries in the timezone transition table: +// Two entries per year, over the length of the Gregorian calendar's solar cycle +static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; + +/** + * @brief Creates a transition table to convert ORC timestamps to UTC. + * + * Uses system's TZif files. Assumes little-endian platform when parsing these files. + * The transition table starts with the entries from the TZif file. For timestamps after the file's + * last transition, the table includes entries that form a `solar_cycle_years`-year cycle (future + * entries). This portion of the table has `solar_cycle_entry_count` elements, as it assumes two + * transitions per year from Daylight Saving Time. If the timezone does not have DST, the table will + * still include the future entries, which will all have the same offset. + * + * @param tzif_dir The directory where the TZif files are located + * @param timezone_name standard timezone name (for example, "America/Los_Angeles") + * @param mr Device memory resource used to allocate the returned table's device memory. + * + * @return The transition table for the given timezone + */ +std::unique_ptr
make_timezone_transition_table( + std::optional tzif_dir, + std::string_view timezone_name, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace cudf diff --git a/cpp/include/cudf/utilities/type_checks.hpp b/cpp/include/cudf/utilities/type_checks.hpp index 4fa712fe7c3..b925fc8ae92 100644 --- a/cpp/include/cudf/utilities/type_checks.hpp +++ b/cpp/include/cudf/utilities/type_checks.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,4 +36,15 @@ namespace cudf { */ bool column_types_equal(column_view const& lhs, column_view const& rhs); +/** + * @brief Compare the type IDs of two `column_view`s + * This function returns true if the type of `lhs` equals that of `rhs`. + * - For fixed point types, the scale is ignored. + * + * @param lhs The first `column_view` to compare + * @param rhs The second `column_view` to compare + * @return true if column types match + */ +bool column_types_equivalent(column_view const& lhs, column_view const& rhs); + } // namespace cudf diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 91773b2c3f1..6341e2e10b0 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -732,9 +732,11 @@ class strings_column_wrapper : public detail::column_wrapper { { auto all_valid = thrust::make_constant_iterator(true); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid); - auto d_chars = cudf::detail::make_device_uvector_sync(chars, cudf::get_default_stream()); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, cudf::get_default_stream()); - wrapped = cudf::make_strings_column(d_chars, d_offsets); + auto d_chars = cudf::detail::make_device_uvector_sync( + chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_offsets = cudf::detail::make_device_uvector_sync( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + wrapped = cudf::make_strings_column(d_chars, d_offsets); } /** @@ -772,10 +774,13 @@ class strings_column_wrapper : public detail::column_wrapper { size_type num_strings = std::distance(begin, end); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v); auto null_mask = detail::make_null_mask_vector(v, v + num_strings); - auto d_chars = cudf::detail::make_device_uvector_sync(chars, cudf::get_default_stream()); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, cudf::get_default_stream()); - auto d_bitmask = cudf::detail::make_device_uvector_sync(null_mask, cudf::get_default_stream()); - wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask); + auto d_chars = cudf::detail::make_device_uvector_sync( + chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_offsets = cudf::detail::make_device_uvector_sync( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_bitmask = cudf::detail::make_device_uvector_sync( + null_mask, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask); } /** diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh 
b/cpp/include/cudf_test/tdigest_utilities.cuh index ce45ad91be1..d23d7f29a6c 100644 --- a/cpp/include/cudf_test/tdigest_utilities.cuh +++ b/cpp/include/cudf_test/tdigest_utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -168,7 +168,8 @@ void tdigest_minmax_compare(cudf::tdigest::tdigest_column_view const& tdv, // verify min/max thrust::host_vector> h_spans; h_spans.push_back({input_values.begin(), static_cast(input_values.size())}); - auto spans = cudf::detail::make_device_uvector_async(h_spans, cudf::get_default_stream()); + auto spans = cudf::detail::make_device_uvector_async( + h_spans, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto expected_min = cudf::make_fixed_width_column( data_type{type_id::FLOAT64}, spans.size(), mask_state::UNALLOCATED); diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py index 9cb8afbff9f..3fe503f749e 100755 --- a/cpp/scripts/sort_ninja_log.py +++ b/cpp/scripts/sort_ninja_log.py @@ -1,10 +1,11 @@ # -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
# import argparse import os import sys import xml.etree.ElementTree as ET +from pathlib import Path from xml.dom import minidom parser = argparse.ArgumentParser() @@ -22,52 +23,50 @@ "--msg", type=str, default=None, - help="optional message to include in html output", + help="optional text file to include at the top of the html output", +) +parser.add_argument( + "--cmp_log", + type=str, + default=None, + help="optional baseline ninja_log to compare results", ) args = parser.parse_args() log_file = args.log_file -log_path = os.path.dirname(os.path.abspath(log_file)) - output_fmt = args.fmt +cmp_file = args.cmp_log # build a map of the log entries -entries = {} -with open(log_file) as log: - last = 0 - files = {} - for line in log: - entry = line.split() - if len(entry) > 4: - obj_file = entry[3] - file_size = ( - os.path.getsize(os.path.join(log_path, obj_file)) - if os.path.exists(obj_file) - else 0 - ) - start = int(entry[0]) - end = int(entry[1]) - # logic based on ninjatracing - if end < last: - files = {} - last = end - files.setdefault(entry[4], (entry[3], start, end, file_size)) - - # build entries from files dict - for entry in files.values(): - entries[entry[0]] = (entry[1], entry[2], entry[3]) - -# check file could be loaded and we have entries to report -if len(entries) == 0: - print("Could not parse", log_file) - exit() +def build_log_map(log_file): + entries = {} + log_path = os.path.dirname(os.path.abspath(log_file)) + with open(log_file) as log: + last = 0 + files = {} + for line in log: + entry = line.split() + if len(entry) > 4: + obj_file = entry[3] + file_size = ( + os.path.getsize(os.path.join(log_path, obj_file)) + if os.path.exists(obj_file) + else 0 + ) + start = int(entry[0]) + end = int(entry[1]) + # logic based on ninjatracing + if end < last: + files = {} + last = end + files.setdefault(entry[4], (entry[3], start, end, file_size)) + + # build entries from files dict + for entry in files.values(): + entries[entry[0]] = (entry[1], entry[2], 
entry[3]) + + return entries -# sort the entries by build-time (descending order) -sorted_list = sorted( - list(entries.keys()), - key=lambda k: entries[k][1] - entries[k][0], - reverse=True, -) # output results in XML format def output_xml(entries, sorted_list, args): @@ -147,14 +146,46 @@ def assign_entries_to_threads(entries): return (results, end_time) -# output chart results in HTML format -def output_html(entries, sorted_list, args): +# format the build-time +def format_build_time(input_time): + build_time = abs(input_time) + build_time_str = str(build_time) + " ms" + if build_time > 120000: # 2 minutes + minutes = int(build_time / 60000) + seconds = int(((build_time / 60000) - minutes) * 60) + build_time_str = "{:d}:{:02d} min".format(minutes, seconds) + elif build_time > 1000: + build_time_str = "{:.3f} s".format(build_time / 1000) + if input_time < 0: + build_time_str = "-" + build_time_str + return build_time_str + + +# format file size +def format_file_size(input_size): + file_size = abs(input_size) + file_size_str = "" + if file_size > 1000000: + file_size_str = "{:.3f} MB".format(file_size / 1000000) + elif file_size > 1000: + file_size_str = "{:.3f} KB".format(file_size / 1000) + elif file_size > 0: + file_size_str = str(file_size) + " bytes" + if input_size < 0: + file_size_str = "-" + file_size_str + return file_size_str + + +# Output chart results in HTML format +# Builds a standalone html file with no javascript or styles +def output_html(entries, sorted_list, cmp_entries, args): print("Build Metrics Report") - # Note: Jenkins does not support javascript nor style defined in the html - # https://www.jenkins.io/doc/book/security/configuring-content-security-policy/ print("") if args.msg is not None: - print("

", args.msg, "

") + msg_file = Path(args.msg) + if msg_file.is_file(): + msg = msg_file.read_text() + print("

", msg, "

") # map entries to threads # the end_time is used to scale all the entries to a fixed output width @@ -201,15 +232,8 @@ def output_html(entries, sorted_list, args): # adjust for the cellspacing prev_end = end + int(end_time / 500) - # format the build-time build_time = end - start - build_time_str = str(build_time) + " ms" - if build_time > 120000: # 2 minutes - minutes = int(build_time / 60000) - seconds = int(((build_time / 60000) - minutes) * 60) - build_time_str = "{:d}:{:02d} min".format(minutes, seconds) - elif build_time > 1000: - build_time_str = "{:.3f} s".format(build_time / 1000) + build_time_str = format_build_time(build_time) # assign color and accumulate legend values color = white @@ -248,7 +272,7 @@ def output_html(entries, sorted_list, args): # done with this entry print("") # update the entry with just the computed output info - entries[name] = (build_time_str, color, entry[2]) + entries[name] = (build_time, color, entry[2]) # add a filler column at the end of each row print("
") @@ -259,30 +283,53 @@ def output_html(entries, sorted_list, args): # output detail table in build-time descending order print("") print( - "", - "", - "", - sep="", + "", "", "", sep="" ) + if cmp_entries: + print("", sep="") + print("") + for name in sorted_list: entry = entries[name] - build_time_str = entry[0] + build_time = entry[0] color = entry[1] file_size = entry[2] - # format file size - file_size_str = "" - if file_size > 1000000: - file_size_str = "{:.3f} MB".format(file_size / 1000000) - elif file_size > 1000: - file_size_str = "{:.3f} KB".format(file_size / 1000) - elif file_size > 0: - file_size_str = str(file_size) + " bytes" + build_time_str = format_build_time(build_time) + file_size_str = format_file_size(file_size) # output entry row print("", sep="", end="") print("", sep="", end="") - print("", sep="") + print("", sep="", end="") + # output diff column + cmp_entry = ( + cmp_entries[name] if cmp_entries and name in cmp_entries else None + ) + if cmp_entry: + diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) + diff_time_str = format_build_time(diff_time) + diff_color = white + diff_percent = int((diff_time / build_time) * 100) + if build_time > 60000: + if diff_percent > 20: + diff_color = red + diff_time_str = "" + diff_time_str + "" + elif diff_percent < -20: + diff_color = green + diff_time_str = "" + diff_time_str + "" + elif diff_percent > 0: + diff_color = yellow + print( + "", + sep="", + end="", + ) + print("") print("
FileCompile timeSize
FileCompile timeSizet-cmp
", name, "", build_time_str, "", file_size_str, "
", file_size_str, "", + diff_time_str, + "

") @@ -296,22 +343,62 @@ def output_html(entries, sorted_list, args): print("", summary["green"], "") print("time < 1 second") print("", summary["white"], "") - print("") + print("") + + if cmp_entries: + print("") + print("time increase > 20%") + print("time increase > 0") + print("time decrease > 20%") + print( + "time change < 20%% or build time < 1 minute", + ) + print("
") + + print("") # output results in CSV format -def output_csv(entries, sorted_list, args): - print("time,size,file") +def output_csv(entries, sorted_list, cmp_entries, args): + print("time,size,file", end="") + if cmp_entries: + print(",diff", end="") + print() for name in sorted_list: entry = entries[name] build_time = entry[1] - entry[0] file_size = entry[2] - print(build_time, file_size, name, sep=",") + cmp_entry = ( + cmp_entries[name] if cmp_entries and name in cmp_entries else None + ) + print(build_time, file_size, name, sep=",", end="") + if cmp_entry: + diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) + print(",", diff_time, sep="", end="") + print() + + +# parse log file into map +entries = build_log_map(log_file) +if len(entries) == 0: + print("Could not parse", log_file) + exit() + +# sort the entries by build-time (descending order) +sorted_list = sorted( + list(entries.keys()), + key=lambda k: entries[k][1] - entries[k][0], + reverse=True, +) +# load the comparison build log if available +cmp_entries = build_log_map(cmp_file) if cmp_file else None if output_fmt == "xml": output_xml(entries, sorted_list, args) elif output_fmt == "html": - output_html(entries, sorted_list, args) + output_html(entries, sorted_list, cmp_entries, args) else: - output_csv(entries, sorted_list, args) + output_csv(entries, sorted_list, cmp_entries, args) diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 5d36d70696c..6d6ef9fd7b0 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -76,7 +76,8 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi std::back_inserter(device_views), [](auto const& col) { return *col; }); - auto d_views = make_device_uvector_async(device_views, stream); + auto d_views = + make_device_uvector_async(device_views, stream, rmm::mr::get_current_device_resource()); // Compute the partition offsets auto offsets = thrust::host_vector(views.size() + 1); @@ -87,7 +88,8 @@ 
auto create_device_views(host_span views, rmm::cuda_stream_vi std::next(offsets.begin()), [](auto const& col) { return col.size(); }, thrust::plus{}); - auto d_offsets = make_device_uvector_async(offsets, stream); + auto d_offsets = + make_device_uvector_async(offsets, stream, rmm::mr::get_current_device_resource()); auto const output_size = offsets.back(); return std::make_tuple( diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index 5e76b4adbbe..cc12aaa1382 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,11 +37,10 @@ namespace { struct get_element_functor { template () && !is_fixed_point()>* p = nullptr> - std::unique_ptr operator()( - column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(column_view const& input, + size_type index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto s = make_fixed_width_scalar(data_type(type_to_id()), stream, mr); @@ -61,11 +60,10 @@ struct get_element_functor { } template >* p = nullptr> - std::unique_ptr operator()( - column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(column_view const& input, + size_type index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto device_col = column_device_view::create(input, stream); @@ -86,11 +84,10 @@ struct get_element_functor { } template >* p = nullptr> - std::unique_ptr operator()( - column_view const& input, - size_type index, - rmm::cuda_stream_view stream, 
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(column_view const& input, + size_type index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto dict_view = dictionary_column_view(input); auto indices_iter = detail::indexalator_factory::make_input_iterator(dict_view.indices()); @@ -122,11 +119,10 @@ struct get_element_functor { } template >* p = nullptr> - std::unique_ptr operator()( - column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(column_view const& input, + size_type index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { bool valid = is_element_valid_sync(input, index, stream); auto const child_col_idx = lists_column_view::child_column_index; @@ -147,11 +143,10 @@ struct get_element_functor { } template ()>* p = nullptr> - std::unique_ptr operator()( - column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(column_view const& input, + size_type index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using Type = typename T::rep; @@ -178,11 +173,10 @@ struct get_element_functor { } template >* p = nullptr> - std::unique_ptr operator()( - column_view const& input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(column_view const& input, + size_type index, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { bool valid = is_element_valid_sync(input, index, stream); auto row_contents = diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu index 5bdf10c8af6..20a8ce986aa 100644 --- 
a/cpp/src/copying/purge_nonempty_nulls.cu +++ b/cpp/src/copying/purge_nonempty_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,6 +38,8 @@ bool has_nonempty_null_rows(cudf::column_view const& input, rmm::cuda_stream_vie { if (not input.has_nulls()) { return false; } // No nulls => no dirty rows. + if ((input.size() == input.null_count()) && (input.num_children() == 0)) { return false; } + // Cross-reference nullmask and offsets. auto const type = input.type().id(); auto const offsets = (type == type_id::STRING) ? (strings_column_view{input}).offsets() diff --git a/cpp/src/io/orc/timezone.cpp b/cpp/src/datetime/timezone.cpp similarity index 79% rename from cpp/src/io/orc/timezone.cpp rename to cpp/src/datetime/timezone.cpp index 810dfe87320..55d68fe4a1a 100644 --- a/cpp/src/io/orc/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,22 +13,24 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "timezone.cuh" +#include +#include +#include #include +#include #include +#include #include namespace cudf { -namespace io { + +namespace { constexpr uint32_t tzif_magic = ('T' << 0) | ('Z' << 8) | ('i' << 16) | ('f' << 24); std::string const tzif_system_directory = "/usr/share/zoneinfo/"; -// Seconds from Jan 1st, 1970 to Jan 1st, 2015 -constexpr int64_t orc_utc_offset = 1420070400; - #pragma pack(push, 1) /** * @brief 32-bit TZif header @@ -127,12 +129,13 @@ struct timezone_file { "Number of transition times is larger than the file size."); } - timezone_file(std::string const& timezone_name) + timezone_file(std::optional tzif_dir, std::string_view timezone_name) { using std::ios_base; // Open the input file - std::string const tz_filename = tzif_system_directory + timezone_name; + auto const tz_filename = + std::filesystem::path{tzif_dir.value_or(tzif_system_directory)} / timezone_name; std::ifstream fin; fin.open(tz_filename, ios_base::in | ios_base::binary | ios_base::ate); CUDF_EXPECTS(fin, "Failed to open the timezone file."); @@ -373,45 +376,62 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) return trans.time + cuda::std::chrono::duration_cast(duration_D{day}).count(); } -timezone_table build_timezone_transition_table(std::string const& timezone_name, - rmm::cuda_stream_view stream) +} // namespace + +std::unique_ptr make_timezone_transition_table(std::optional tzif_dir, + std::string_view timezone_name, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::make_timezone_transition_table( + tzif_dir, timezone_name, cudf::get_default_stream(), mr); +} + +namespace detail { + +std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, + std::string_view timezone_name, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (timezone_name == "UTC" || timezone_name.empty()) { // Return an empty table for UTC - return {}; + return std::make_unique(); } - timezone_file const tzf(timezone_name); + timezone_file const tzf(tzif_dir, timezone_name); - std::vector ttimes(1); - std::vector offsets(1); + std::vector transition_times(1); + std::vector offsets(1); // One ancient rule entry, one per TZ file entry, 2 entries per year in the future cycle - ttimes.reserve(1 + tzf.timecnt() + cycle_entry_cnt); - offsets.reserve(1 + tzf.timecnt() + cycle_entry_cnt); + transition_times.reserve(1 + tzf.timecnt() + solar_cycle_entry_count); + offsets.reserve(1 + tzf.timecnt() + solar_cycle_entry_count); size_t earliest_std_idx = 0; for (size_t t = 0; t < tzf.timecnt(); t++) { auto const ttime = tzf.transition_times[t]; auto const idx = tzf.ttime_idx[t]; CUDF_EXPECTS(idx < tzf.typecnt(), "Out-of-range type index"); auto const utcoff = tzf.ttype[idx].utcoff; - ttimes.push_back(ttime); + transition_times.push_back(ttime); offsets.push_back(utcoff); - if (!earliest_std_idx && !tzf.ttype[idx].isdst) { earliest_std_idx = ttimes.size() - 1; } + if (!earliest_std_idx && !tzf.ttype[idx].isdst) { + earliest_std_idx = transition_times.size() - 1; + } } if (tzf.timecnt() != 0) { if (!earliest_std_idx) { earliest_std_idx = 1; } - ttimes[0] = ttimes[earliest_std_idx]; - offsets[0] = offsets[earliest_std_idx]; + transition_times[0] = transition_times[earliest_std_idx]; + offsets[0] = offsets[earliest_std_idx]; } else { if (tzf.typecnt() == 0 || tzf.ttype[0].utcoff == 0) { // No transitions, offset is zero; Table would be a no-op. // Return an empty table to speed up parsing. 
- return {}; + return std::make_unique(); } // No transitions to use for the time/offset - use the first offset and apply to all timestamps - ttimes[0] = std::numeric_limits::max(); - offsets[0] = tzf.ttype[0].utcoff; + transition_times[0] = std::numeric_limits::max(); + offsets[0] = tzf.ttype[0].utcoff; } // Generate entries for times after the last transition @@ -440,19 +460,19 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name, // Add entries to fill the transition cycle int64_t year_timestamp = 0; - for (int32_t year = 1970; year < 1970 + cycle_years; ++year) { + for (int32_t year = 1970; year < 1970 + solar_cycle_years; ++year) { auto const dst_start_time = get_transition_time(dst_start, year); auto const dst_end_time = get_transition_time(dst_end, year); // Two entries per year, since there are two transitions - ttimes.push_back(year_timestamp + dst_start_time - future_std_offset); + transition_times.push_back(year_timestamp + dst_start_time - future_std_offset); offsets.push_back(future_dst_offset); - ttimes.push_back(year_timestamp + dst_end_time - future_dst_offset); + transition_times.push_back(year_timestamp + dst_end_time - future_dst_offset); offsets.push_back(future_std_offset); // Swap the newly added transitions if in descending order - if (ttimes.rbegin()[1] > ttimes.rbegin()[0]) { - std::swap(ttimes.rbegin()[0], ttimes.rbegin()[1]); + if (transition_times.rbegin()[1] > transition_times.rbegin()[0]) { + std::swap(transition_times.rbegin()[0], transition_times.rbegin()[1]); std::swap(offsets.rbegin()[0], offsets.rbegin()[1]); } @@ -461,13 +481,33 @@ timezone_table build_timezone_transition_table(std::string const& timezone_name, .count(); } - rmm::device_uvector d_ttimes = cudf::detail::make_device_uvector_async(ttimes, stream); - rmm::device_uvector d_offsets = cudf::detail::make_device_uvector_async(offsets, stream); - auto const gmt_offset = get_gmt_offset(ttimes, offsets, orc_utc_offset); + 
CUDF_EXPECTS(transition_times.size() == offsets.size(), + "Error reading TZif file for timezone " + std::string{timezone_name}); + + std::vector ttimes_typed; + ttimes_typed.reserve(transition_times.size()); + std::transform(transition_times.cbegin(), + transition_times.cend(), + std::back_inserter(ttimes_typed), + [](auto ts) { return timestamp_s{duration_s{ts}}; }); + std::vector offsets_typed; + offsets_typed.reserve(offsets.size()); + std::transform(offsets.cbegin(), offsets.cend(), std::back_inserter(offsets_typed), [](auto ts) { + return duration_s{ts}; + }); + + auto d_ttimes = cudf::detail::make_device_uvector_async(ttimes_typed, stream, mr); + auto d_offsets = cudf::detail::make_device_uvector_async(offsets_typed, stream, mr); + + std::vector> tz_table_columns; + tz_table_columns.emplace_back(std::make_unique(std::move(d_ttimes))); + tz_table_columns.emplace_back(std::make_unique(std::move(d_offsets))); + + // Need to finish copies before transition_times and offsets go out of scope stream.synchronize(); - return {gmt_offset, std::move(d_ttimes), std::move(d_offsets)}; + return std::make_unique(std::move(tz_table_columns)); } -} // namespace io +} // namespace detail } // namespace cudf diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index d4f3a9ca495..bc54f65bbd3 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -114,7 +114,8 @@ struct compute_children_offsets_fn { [](auto lhs, auto rhs) { return offsets_pair{lhs.first + rhs.first, lhs.second + rhs.second}; }); - return cudf::detail::make_device_uvector_sync(offsets, stream); + return cudf::detail::make_device_uvector_sync( + offsets, stream, rmm::mr::get_current_device_resource()); } private: diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index ecd66f1b0c9..a747cc195ae 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -104,11 +104,10 @@ struct out_of_place_fill_range_dispatch { template () or cudf::is_fixed_point())> - std::unique_ptr operator()( - cudf::size_type begin, - cudf::size_type end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + std::unique_ptr operator()(cudf::size_type begin, + cudf::size_type end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch."); auto p_ret = std::make_unique(input, stream, mr); diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu index 284e7c46347..b4bab369c61 100644 --- a/cpp/src/filling/sequence.cu +++ b/cpp/src/filling/sequence.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -134,11 +134,10 @@ std::unique_ptr sequence(size_type size, return type_dispatcher(init.type(), sequence_functor{}, size, init, step, stream, mr); } -std::unique_ptr sequence( - size_type size, - scalar const& init, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr sequence(size_type size, + scalar const& init, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(size >= 0, "size must be >= 0"); CUDF_EXPECTS(is_numeric(init.type()), "init scalar type must be numeric"); diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 72ac6255549..f8203218760 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -481,7 +481,8 @@ void compute_single_pass_aggs(table_view const& keys, // prepare to launch kernel to do the actual aggregation auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); auto d_values = table_device_view::create(flattened_values, stream); - auto const d_aggs = cudf::detail::make_device_uvector_async(agg_kinds, stream); + auto const d_aggs = cudf::detail::make_device_uvector_async( + agg_kinds, stream, rmm::mr::get_current_device_resource()); auto const skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index 90ca5a5c90e..a9edcfecbf7 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -156,7 +156,8 @@ std::unique_ptr group_quantiles(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto dv_quantiles = cudf::detail::make_device_uvector_async(quantiles, stream); + auto dv_quantiles = cudf::detail::make_device_uvector_async( + quantiles, stream, rmm::mr::get_current_device_resource()); auto values_type = cudf::is_dictionary(values.type()) ? dictionary_column_view(values).keys().type() diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 1aaa06750db..820dc8a3077 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -129,8 +129,10 @@ void scan_result_functor::operator()(aggregation const& agg) auto const group_labels_view = column_view(cudf::device_span(group_labels)); auto const gather_map = [&]() { if (is_presorted()) { // assumes both keys and values are sorted, Spark does this. - return cudf::detail::sequence( - group_labels.size(), *cudf::make_fixed_width_scalar(size_type{0}, stream), stream); + return cudf::detail::sequence(group_labels.size(), + *cudf::make_fixed_width_scalar(size_type{0}, stream), + stream, + rmm::mr::get_current_device_resource()); } else { auto sort_order = (rank_agg._method == rank_method::FIRST ? cudf::detail::stable_sorted_order : cudf::detail::sorted_order); diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh index c017fd43079..55036bec6a6 100644 --- a/cpp/src/hash/unordered_multiset.cuh +++ b/cpp/src/hash/unordered_multiset.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -84,10 +84,10 @@ class unordered_multiset { auto d_column = column_device_view::create(col, stream); auto d_col = *d_column; - auto hash_bins_start = - cudf::detail::make_zeroed_device_uvector_async(2 * d_col.size() + 1, stream); - auto hash_bins_end = - cudf::detail::make_zeroed_device_uvector_async(2 * d_col.size() + 1, stream); + auto hash_bins_start = cudf::detail::make_zeroed_device_uvector_async( + 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); + auto hash_bins_end = cudf::detail::make_zeroed_device_uvector_async( + 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); auto hash_data = rmm::device_uvector(d_col.size(), stream); Hasher hasher; diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 861b5b0fba4..7f88019beb2 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -215,7 +215,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - auto bitmask = bools_to_mask(input, stream); + auto bitmask = bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); auto data_buffer = allocate_arrow_buffer(static_cast(bitmask.first->size()), ar_mr); diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index cd557ff166a..60a1b4263b2 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -444,7 +444,8 @@ std::vector decode_data(metadata& meta, } } - auto block_list = cudf::detail::make_device_uvector_async(meta.block_list, stream); + auto block_list = cudf::detail::make_device_uvector_async( + meta.block_list, stream, rmm::mr::get_current_device_resource()); schema_desc.host_to_device(stream); @@ -574,8 +575,10 @@ table_with_metadata read_avro(std::unique_ptr&& source, } } - d_global_dict = cudf::detail::make_device_uvector_async(h_global_dict, stream); - d_global_dict_data = cudf::detail::make_device_uvector_async(h_global_dict_data, stream); + 
d_global_dict = cudf::detail::make_device_uvector_async( + h_global_dict, stream, rmm::mr::get_current_device_resource()); + d_global_dict_data = cudf::detail::make_device_uvector_async( + h_global_dict_data, stream, rmm::mr::get_current_device_resource()); stream.synchronize(); } diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 6778ddead28..008c7215cca 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -509,9 +509,10 @@ size_t decompress_zstd(host_span src, rmm::cuda_stream_view stream) { // Init device span of spans (source) - auto const d_src = cudf::detail::make_device_uvector_async(src, stream); - auto hd_srcs = hostdevice_vector>(1, stream); - hd_srcs[0] = d_src; + auto const d_src = + cudf::detail::make_device_uvector_async(src, stream, rmm::mr::get_current_device_resource()); + auto hd_srcs = hostdevice_vector>(1, stream); + hd_srcs[0] = d_src; hd_srcs.host_to_device(stream); // Init device span of spans (temporary destination) diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 4f6f8162246..51e3783bac5 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -794,8 +794,8 @@ std::vector detect_column_types( const int block_size = csvparse_block_dim; const int grid_size = (row_starts.size() + block_size - 1) / block_size; - auto d_stats = - detail::make_zeroed_device_uvector_async(num_active_columns, stream); + auto d_stats = detail::make_zeroed_device_uvector_async( + num_active_columns, stream, rmm::mr::get_current_device_resource()); data_type_detection<<>>( options, data, column_flags, row_starts, d_stats); diff --git a/cpp/src/io/csv/durations.hpp b/cpp/src/io/csv/durations.hpp index d42ddf3817c..ac925011c58 100644 --- a/cpp/src/io/csv/durations.hpp +++ b/cpp/src/io/csv/durations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,10 +28,9 @@ namespace io { namespace detail { namespace csv { -std::unique_ptr pandas_format_durations( - column_view const& durations, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr pandas_format_durations(column_view const& durations, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace csv } // namespace detail diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 2e38ea7f4ab..9c1ff67d97c 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -523,13 +523,13 @@ void infer_column_types(parse_options const& parse_opts, }); if (num_inferred_columns == 0) { return; } - auto const column_stats = - cudf::io::csv::gpu::detect_column_types(parse_opts.view(), - data, - make_device_uvector_async(column_flags, stream), - row_offsets, - num_inferred_columns, - stream); + auto const column_stats = cudf::io::csv::gpu::detect_column_types( + parse_opts.view(), + data, + make_device_uvector_async(column_flags, stream, 
rmm::mr::get_current_device_resource()), + row_offsets, + num_inferred_columns, + stream); stream.synchronize(); auto inf_col_idx = 0; @@ -595,14 +595,15 @@ std::vector decode_data(parse_options const& parse_opts, h_valid[i] = out_buffers[i].null_mask(); } - cudf::io::csv::gpu::decode_row_column_data(parse_opts.view(), - data, - make_device_uvector_async(column_flags, stream), - row_offsets, - make_device_uvector_async(column_types, stream), - make_device_uvector_async(h_data, stream), - make_device_uvector_async(h_valid, stream), - stream); + cudf::io::csv::gpu::decode_row_column_data( + parse_opts.view(), + data, + make_device_uvector_async(column_flags, stream, rmm::mr::get_current_device_resource()), + row_offsets, + make_device_uvector_async(column_types, stream, rmm::mr::get_current_device_resource()), + make_device_uvector_async(h_data, stream, rmm::mr::get_current_device_resource()), + make_device_uvector_async(h_valid, stream, rmm::mr::get_current_device_resource()), + stream); return out_buffers; } diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index 70a0b66ebc6..c18b15708ab 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -80,7 +80,8 @@ rmm::device_uvector ingest_raw_input(host_span auto const uncomp_data = decompress(compression, buffer); return cudf::detail::make_device_uvector_sync( host_span{reinterpret_cast(uncomp_data.data()), uncomp_data.size()}, - stream); + stream, + rmm::mr::get_current_device_resource()); } } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index d174cc8aca3..c937315969c 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -602,8 +602,10 @@ void make_device_json_column(device_span input, col.validity.data()}; } - auto d_ignore_vals = cudf::detail::make_device_uvector_async(ignore_vals, stream); - auto d_columns_data = 
cudf::detail::make_device_uvector_async(columns_data, stream); + auto d_ignore_vals = cudf::detail::make_device_uvector_async( + ignore_vals, stream, rmm::mr::get_current_device_resource()); + auto d_columns_data = cudf::detail::make_device_uvector_async( + columns_data, stream, rmm::mr::get_current_device_resource()); // 3. scatter string offsets to respective columns, set validity bits thrust::for_each_n( @@ -891,7 +893,8 @@ table_with_metadata device_parse_nested_json(device_span d_input, auto gpu_tree = [&]() { // Parse the JSON and get the token stream - const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream); + const auto [tokens_gpu, token_indices_gpu] = + get_token_stream(d_input, options, stream, rmm::mr::get_current_device_resource()); // gpu tree generation return get_tree_representation( tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 8b6c0f9d528..d1711db0484 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -578,7 +578,7 @@ std::vector detect_data_types( return d_column_infos; } else { return cudf::detail::make_zeroed_device_uvector_async( - num_columns, stream); + num_columns, stream, rmm::mr::get_current_device_resource()); } }(); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 30b3911089f..77749b42781 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1597,9 +1597,11 @@ std::pair, std::vector> json_column_to // Move string_offsets and string_lengths to GPU rmm::device_uvector d_string_offsets = - cudf::detail::make_device_uvector_async(json_col.string_offsets, stream); + cudf::detail::make_device_uvector_async( + json_col.string_offsets, stream, rmm::mr::get_current_device_resource()); rmm::device_uvector d_string_lengths = - cudf::detail::make_device_uvector_async(json_col.string_lengths, stream); + cudf::detail::make_device_uvector_async( + json_col.string_lengths, stream, rmm::mr::get_current_device_resource()); // Prepare iterator that returns (string_offset, string_length)-tuples auto offset_length_it = diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 6e1089796de..7ae8deb8055 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -340,8 +340,8 @@ rmm::device_uvector upload_data_to_device(json_reader_options const& reade "Error finding the record within the specified byte range.\n"); // Upload the raw data that is within the rows of interest - return cudf::detail::make_device_uvector_async(h_data.subspan(start_offset, bytes_to_upload), - stream); + return cudf::detail::make_device_uvector_async( + h_data.subspan(start_offset, bytes_to_upload), stream, rmm::mr::get_current_device_resource()); } std::pair, col_map_ptr_type> get_column_names_and_map( @@ -512,11 +512,14 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, h_valid[i] = out_buffers[i].null_mask(); } - auto d_dtypes = 
cudf::detail::make_device_uvector_async(h_dtypes, stream); - auto d_data = cudf::detail::make_device_uvector_async(h_data, stream); - auto d_valid = cudf::detail::make_device_uvector_async(h_valid, stream); - auto d_valid_counts = - cudf::detail::make_zeroed_device_uvector_async(num_columns, stream); + auto d_dtypes = cudf::detail::make_device_uvector_async( + h_dtypes, stream, rmm::mr::get_current_device_resource()); + auto d_data = cudf::detail::make_device_uvector_async( + h_data, stream, rmm::mr::get_current_device_resource()); + auto d_valid = cudf::detail::make_device_uvector_async( + h_valid, stream, rmm::mr::get_current_device_resource()); + auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async( + num_columns, stream, rmm::mr::get_current_device_resource()); cudf::io::json::gpu::convert_json_to_columns( parse_opts, data, rec_starts, d_dtypes, column_map, d_data, d_valid, d_valid_counts, stream); @@ -530,13 +533,18 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, auto repl_chars = std::vector{'"', '\\', '\t', '\r', '\b'}; auto repl_offsets = std::vector{0, 1, 2, 3, 4, 5}; - auto target = make_strings_column(cudf::detail::make_device_uvector_async(target_chars, stream), - cudf::detail::make_device_uvector_async(target_offsets, stream), - {}, - 0, - stream); - auto repl = make_strings_column(cudf::detail::make_device_uvector_async(repl_chars, stream), - cudf::detail::make_device_uvector_async(repl_offsets, stream), + auto target = + make_strings_column(cudf::detail::make_device_uvector_async( + target_chars, stream, rmm::mr::get_current_device_resource()), + cudf::detail::make_device_uvector_async( + target_offsets, stream, rmm::mr::get_current_device_resource()), + {}, + 0, + stream); + auto repl = make_strings_column(cudf::detail::make_device_uvector_async( + repl_chars, stream, rmm::mr::get_current_device_resource()), + cudf::detail::make_device_uvector_async( + repl_offsets, stream, 
rmm::mr::get_current_device_resource()), {}, 0, stream); @@ -617,7 +625,8 @@ table_with_metadata read_json(std::vector>& sources, auto d_data = rmm::device_uvector(0, stream); if (should_load_whole_source(reader_opts)) { - d_data = cudf::detail::make_device_uvector_async(h_data, stream); + d_data = cudf::detail::make_device_uvector_async( + h_data, stream, rmm::mr::get_current_device_resource()); } auto rec_starts = find_record_starts(reader_opts, h_data, d_data, stream); diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index b4bcb5548de..9e56b20114c 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -552,14 +552,16 @@ std::unique_ptr make_strings_column_from_host(host_span offsets(host_strings.size() + 1, 0); std::transform_inclusive_scan(host_strings.begin(), host_strings.end(), offsets.begin() + 1, std::plus{}, [](auto& str) { return str.size(); }); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, stream); + auto d_offsets = + cudf::detail::make_device_uvector_sync(offsets, stream, rmm::mr::get_current_device_resource()); return cudf::make_strings_column( host_strings.size(), std::move(d_offsets), std::move(d_chars), {}, 0); } diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index d30c3823080..21fc04a69ec 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -38,6 +38,8 @@ namespace io { namespace orc { static constexpr uint32_t block_header_size = 3; +// Seconds from January 1st, 1970 to January 1st, 2015 +static constexpr int64_t orc_utc_epoch = 1420070400; struct PostScript { uint64_t footerLength = 0; // the length of the footer section in bytes diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 43f0565845c..05560a3ca62 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -16,7 +16,7 @@ #pragma once -#include "timezone.cuh" +#include #include "orc.hpp" @@ -294,7 +294,7 @@ void DecodeOrcColumnData(ColumnDesc* 
chunks, uint32_t num_columns, uint32_t num_stripes, size_t first_row, - timezone_table_view tz_table, + table_device_view tz_table, uint32_t num_rowgroups, uint32_t rowidx_stride, size_t level, diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 96eb20e1e66..bcf53159676 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -23,13 +23,13 @@ #include "orc_gpu.hpp" #include "reader_impl.hpp" -#include "timezone.cuh" #include #include #include #include +#include #include #include #include @@ -576,8 +576,8 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]); } } - auto const d_prefix_sums_to_update = - cudf::detail::make_device_uvector_async(prefix_sums_to_update, stream); + auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( + prefix_sums_to_update, stream, rmm::mr::get_current_device_resource()); thrust::for_each(rmm::exec_policy(stream), d_prefix_sums_to_update.begin(), @@ -603,7 +603,7 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& void reader::impl::decode_stream_data(cudf::detail::hostdevice_2dvector& chunks, size_t num_dicts, size_t skip_rows, - timezone_table_view tz_table, + table_device_view tz_table, cudf::detail::hostdevice_2dvector& row_groups, size_t row_index_stride, std::vector& out_buffers, @@ -915,11 +915,11 @@ reader::impl::impl(std::vector>&& sources, decimal128_columns = options.get_decimal128_columns(); } -timezone_table reader::impl::compute_timezone_table( +std::unique_ptr
reader::impl::compute_timezone_table( const std::vector& selected_stripes, rmm::cuda_stream_view stream) { - if (selected_stripes.empty()) return {}; + if (selected_stripes.empty()) return std::make_unique(); auto const has_timestamp_column = std::any_of( selected_columns.levels.cbegin(), selected_columns.levels.cend(), [&](auto& col_lvl) { @@ -927,10 +927,10 @@ timezone_table reader::impl::compute_timezone_table( return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; }); }); - if (not has_timestamp_column) return {}; + if (not has_timestamp_column) return std::make_unique(); - return build_timezone_transition_table(selected_stripes[0].stripe_info[0].second->writerTimezone, - stream); + return cudf::detail::make_timezone_transition_table( + {}, selected_stripes[0].stripe_info[0].second->writerTimezone, stream); } table_with_metadata reader::impl::read(size_type skip_rows, @@ -1038,7 +1038,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, selected_columns.levels[level].size(), [&]() { return cudf::detail::make_zeroed_device_uvector_async( - total_num_stripes, stream); + total_num_stripes, stream, rmm::mr::get_current_device_resource()); }); // Tracker for eventually deallocating compressed and uncompressed data @@ -1238,10 +1238,11 @@ table_with_metadata reader::impl::read(size_type skip_rows, } if (not is_level_data_empty) { + auto const tz_table_dview = table_device_view::create(tz_table->view(), stream); decode_stream_data(chunks, num_dict_entries, skip_rows, - tz_table.view(), + *tz_table_dview, row_groups, _metadata.get_row_index_stride(), out_buffers[level], @@ -1270,7 +1271,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, }); if (buff_data.size()) { - auto const dev_buff_data = cudf::detail::make_device_uvector_async(buff_data, stream); + auto const dev_buff_data = cudf::detail::make_device_uvector_async( + buff_data, stream, rmm::mr::get_current_device_resource()); generate_offsets_for_list(dev_buff_data, 
stream); } } diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 96492e4c2b2..94b0fdc09d2 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -145,7 +145,7 @@ class reader::impl { void decode_stream_data(cudf::detail::hostdevice_2dvector& chunks, size_t num_dicts, size_t skip_rows, - timezone_table_view tz_table, + table_device_view tz_table, cudf::detail::hostdevice_2dvector& row_groups, size_t row_index_stride, std::vector& out_buffers, @@ -210,7 +210,7 @@ class reader::impl { * * @return Timezone table with timestamp offsets */ - timezone_table compute_timezone_table( + std::unique_ptr
compute_timezone_table( const std::vector& selected_stripes, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index d0d077d2611..8e698dd9dff 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,9 +43,6 @@ inline __device__ uint8_t is_rlev1(uint8_t encoding_mode) { return encoding_mode inline __device__ uint8_t is_dictionary(uint8_t encoding_mode) { return encoding_mode & 1; } -static __device__ __constant__ int64_t kORCTimeToUTC = - 1420070400; // Seconds from January 1st, 1970 to January 1st, 2015 - struct orc_bytestream_s { const uint8_t* base; uint32_t pos; @@ -101,7 +98,7 @@ struct orc_datadec_state_s { uint32_t max_vals; // max # of non-zero values to decode in this batch uint32_t nrows; // # of rows in current batch (up to block_size) uint32_t buffered_count; // number of buffered values in the secondary data stream - int64_t utc_epoch; // kORCTimeToUTC - gmtOffset + duration_s tz_epoch; // orc_ut_epoch - ut_offset RowGroup index; }; @@ -1374,7 +1371,7 @@ template __global__ void __launch_bounds__(block_size) gpuDecodeOrcColumnData(ColumnDesc* chunks, DictionaryEntry* global_dictionary, - timezone_table_view tz_table, + table_device_view tz_table, device_2dspan row_groups, size_t first_row, uint32_t rowidx_stride, @@ -1446,7 +1443,8 @@ __global__ void __launch_bounds__(block_size) } if (!is_dictionary(s->chunk.encoding_kind)) { s->chunk.dictionary_start = 0; } - s->top.data.utc_epoch = kORCTimeToUTC - tz_table.gmt_offset; + static constexpr duration_s d_orc_utc_epoch = duration_s{orc_utc_epoch}; + s->top.data.tz_epoch = d_orc_utc_epoch - get_ut_offset(tz_table, timestamp_s{d_orc_utc_epoch}); bytestream_init(&s->bs, 
s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); @@ -1769,37 +1767,33 @@ __global__ void __launch_bounds__(block_size) break; } case TIMESTAMP: { - int64_t seconds = s->vals.i64[t + vals_skipped] + s->top.data.utc_epoch; - int64_t nanos = secondary_val; - nanos = (nanos >> 3) * kTimestampNanoScale[nanos & 7]; - if (!tz_table.ttimes.empty()) { - seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds); - } + auto seconds = s->top.data.tz_epoch + duration_s{s->vals.i64[t + vals_skipped]}; + // Convert to UTC + seconds += get_ut_offset(tz_table, timestamp_s{seconds}); + + duration_ns nanos = duration_ns{(static_cast(secondary_val) >> 3) * + kTimestampNanoScale[secondary_val & 7]}; + // Adjust seconds only for negative timestamps with positive nanoseconds. // Alternative way to represent negative timestamps is with negative nanoseconds // in which case the adjustment in not needed. // Comparing with 999999 instead of zero to match the apache writer. 
- if (seconds < 0 and nanos > 999999) { seconds -= 1; } - - duration_ns d_ns{nanos}; - duration_s d_s{seconds}; + if (seconds.count() < 0 and nanos.count() > 999999) { seconds -= duration_s{1}; } static_cast(data_out)[row] = [&]() { using cuda::std::chrono::duration_cast; switch (s->chunk.timestamp_type_id) { case type_id::TIMESTAMP_SECONDS: - return d_s.count() + duration_cast(d_ns).count(); + return (seconds + duration_cast(nanos)).count(); case type_id::TIMESTAMP_MILLISECONDS: - return duration_cast(d_s).count() + - duration_cast(d_ns).count(); + return (seconds + duration_cast(nanos)).count(); case type_id::TIMESTAMP_MICROSECONDS: - return duration_cast(d_s).count() + - duration_cast(d_ns).count(); + return (seconds + duration_cast(nanos)).count(); case type_id::TIMESTAMP_NANOSECONDS: default: - return duration_cast(d_s).count() + - d_ns.count(); // nanoseconds as output in case of `type_id::EMPTY` and - // `type_id::TIMESTAMP_NANOSECONDS` + // nanoseconds as output in case of `type_id::EMPTY` and + // `type_id::TIMESTAMP_NANOSECONDS` + return (seconds + nanos).count(); } }(); @@ -1887,7 +1881,7 @@ void __host__ DecodeOrcColumnData(ColumnDesc* chunks, uint32_t num_columns, uint32_t num_stripes, size_t first_row, - timezone_table_view tz_table, + table_device_view tz_table, uint32_t num_rowgroups, uint32_t rowidx_stride, size_t level, diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 9032e3d2502..427167e2d0f 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -49,9 +49,6 @@ constexpr int scratch_buffer_size = 512 * 4; // Workaround replaces zero-length patch lists by a dummy zero patch constexpr bool zero_pll_war = true; -static __device__ __constant__ int64_t kORCTimeToUTC = - 1420070400; // Seconds from January 1st, 1970 to January 1st, 2015 - struct byterle_enc_state_s { uint32_t literal_run; uint32_t repeat_run; @@ -814,7 +811,7 @@ __global__ void __launch_bounds__(block_size) int32_t ts_scale = powers_of_ten[9 - min(s->chunk.scale, 9)]; int64_t seconds = ts / ts_scale; int64_t nanos = (ts - seconds * ts_scale); - s->vals.i64[nz_idx] = seconds - kORCTimeToUTC; + s->vals.i64[nz_idx] = seconds - orc_utc_epoch; if (nanos != 0) { // Trailing zeroes are encoded in the lower 3-bits uint32_t zeroes = 0; diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh deleted file mode 100644 index 52736d6451a..00000000000 --- a/cpp/src/io/orc/timezone.cuh +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -namespace cudf { -namespace io { - -struct timezone_table_view { - int32_t gmt_offset = 0; - cudf::device_span ttimes; - cudf::device_span offsets; -}; - -// Cycle in which the time offsets repeat -static constexpr int32_t cycle_years = 400; -// Number of seconds in 400 years -static constexpr int64_t cycle_seconds = - cuda::std::chrono::duration_cast(duration_D{365 * cycle_years + (100 - 3)}).count(); -// Two entries per year, over the length of the cycle -static constexpr uint32_t cycle_entry_cnt = 2 * cycle_years; - -/** - * @brief Returns the GMT offset for a given date and given timezone table. - * - * @param ttimes Transition times; trailing `cycle_entry_cnt` entries are used for all times - * beyond the one covered by the TZif file - * @param offsets Time offsets in specific intervals; trailing `cycle_entry_cnt` entries are used - * for all times beyond the one covered by the TZif file - * @param count Number of elements in @p ttimes and @p offsets - * @param ts ORC timestamp - * - * @return GMT offset - */ -CUDF_HOST_DEVICE inline int32_t get_gmt_offset_impl(int64_t const* ttimes, - int32_t const* offsets, - size_t count, - int64_t ts) -{ - // Returns start of the range if all elements are larger than the input timestamp - auto last_less_equal_ttime_idx = [&](long begin_idx, long end_idx, int64_t ts) { - auto const first_larger_ttime = - thrust::upper_bound(thrust::seq, ttimes + begin_idx, ttimes + end_idx, ts); - // Element before the first larger element is the last one less of equal - return std::max(first_larger_ttime - ttimes - 1, begin_idx); - }; - - auto const file_entry_cnt = count - cycle_entry_cnt; - // Search in the file entries if the timestamp is in range - if (ts <= ttimes[file_entry_cnt - 1]) { - return offsets[last_less_equal_ttime_idx(0, file_entry_cnt, ts)]; - } else { - // Search in the 400-year cycle 
if outside of the file entries range - return offsets[last_less_equal_ttime_idx( - file_entry_cnt, count, (ts + cycle_seconds) % cycle_seconds)]; - } -} - -/** - * @brief Host `get_gmt_offset` interface. - * - * Implemented in `get_gmt_offset_impl`. - */ -inline __host__ int32_t get_gmt_offset(cudf::host_span ttimes, - cudf::host_span offsets, - int64_t ts) -{ - CUDF_EXPECTS(ttimes.size() == offsets.size(), - "transition times and offsets must have the same length"); - return get_gmt_offset_impl(ttimes.begin(), offsets.begin(), ttimes.size(), ts); -} - -/** - * @brief Device `get_gmt_offset` interface. - * - * Implemented in `get_gmt_offset_impl`. - */ -inline __device__ int32_t get_gmt_offset(cudf::device_span ttimes, - cudf::device_span offsets, - int64_t ts) -{ - return get_gmt_offset_impl(ttimes.begin(), offsets.begin(), ttimes.size(), ts); -} - -class timezone_table { - int32_t gmt_offset = 0; - rmm::device_uvector ttimes; - rmm::device_uvector offsets; - - public: - // Safe to use the default stream, device_uvectors will not change after they are created empty - timezone_table() : ttimes{0, cudf::get_default_stream()}, offsets{0, cudf::get_default_stream()} - { - } - timezone_table(int32_t gmt_offset, - rmm::device_uvector&& ttimes, - rmm::device_uvector&& offsets) - : gmt_offset{gmt_offset}, ttimes{std::move(ttimes)}, offsets{std::move(offsets)} - { - } - [[nodiscard]] timezone_table_view view() const { return {gmt_offset, ttimes, offsets}; } -}; - -/** - * @brief Creates a transition table to convert ORC timestamps to UTC. - * - * Uses system's TZif files. Assumes little-endian platform when parsing these files. 
- * - * @param timezone_name standard timezone name (for example, "US/Pacific") - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return The transition table for the given timezone - */ -timezone_table build_timezone_transition_table(std::string const& timezone_name, - rmm::cuda_stream_view stream); - -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 00b5c5428b1..d3bb0a45c12 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -332,6 +333,136 @@ size_type orc_table_view::num_rows() const noexcept return columns.empty() ? 0 : columns.front().size(); } +orc_streams::orc_stream_offsets orc_streams::compute_offsets( + host_span columns, size_t num_rowgroups) const +{ + std::vector strm_offsets(streams.size()); + size_t non_rle_data_size = 0; + size_t rle_data_size = 0; + for (size_t i = 0; i < streams.size(); ++i) { + const auto& stream = streams[i]; + + auto const is_rle_data = [&]() { + // First stream is an index stream, don't check types, etc. 
+ if (!stream.column_index().has_value()) return true; + + auto const& column = columns[stream.column_index().value()]; + // Dictionary encoded string column - dictionary characters or + // directly encoded string - column characters + if (column.orc_kind() == TypeKind::STRING && + ((stream.kind == DICTIONARY_DATA && column.orc_encoding() == DICTIONARY_V2) || + (stream.kind == DATA && column.orc_encoding() == DIRECT_V2))) + return false; + // Decimal data + if (column.orc_kind() == TypeKind::DECIMAL && stream.kind == DATA) return false; + + // Everything else uses RLE + return true; + }(); + // non-RLE and RLE streams are separated in the buffer that stores encoded data + // The computed offsets do not take the streams of the other type into account + if (is_rle_data) { + strm_offsets[i] = rle_data_size; + rle_data_size += (stream.length + 7) & ~7; + } else { + strm_offsets[i] = non_rle_data_size; + non_rle_data_size += stream.length; + } + } + non_rle_data_size = (non_rle_data_size + 7) & ~7; + + return {std::move(strm_offsets), non_rle_data_size, rle_data_size}; +} + +namespace { +struct string_length_functor { + __device__ inline size_type operator()(int const i) const + { + // we translate from 0 -> num_chunks * 2 because each statistic has a min and max + // string and we need to calculate lengths for both. + if (i >= num_chunks * 2) return 0; + + // min strings are even values, max strings are odd values of i + auto const should_copy_min = i % 2 == 0; + // index of the chunk + auto const idx = i / 2; + auto& str_val = should_copy_min ? stripe_stat_chunks[idx].min_value.str_val + : stripe_stat_chunks[idx].max_value.str_val; + auto const str = stripe_stat_merge[idx].stats_dtype == dtype_string; + return str ? 
str_val.length : 0; + } + + int const num_chunks; + statistics_chunk const* stripe_stat_chunks; + statistics_merge_group const* stripe_stat_merge; +}; + +__global__ void copy_string_data(char* string_pool, + size_type* offsets, + statistics_chunk* chunks, + statistics_merge_group const* groups) +{ + auto const idx = blockIdx.x / 2; + if (groups[idx].stats_dtype == dtype_string) { + // min strings are even values, max strings are odd values of i + auto const should_copy_min = blockIdx.x % 2 == 0; + auto& str_val = should_copy_min ? chunks[idx].min_value.str_val : chunks[idx].max_value.str_val; + auto dst = &string_pool[offsets[blockIdx.x]]; + auto src = str_val.ptr; + + for (int i = threadIdx.x; i < str_val.length; i += blockDim.x) { + dst[i] = src[i]; + } + if (threadIdx.x == 0) { str_val.ptr = dst; } + } +} + +} // namespace + +void persisted_statistics::persist(int num_table_rows, + bool single_write_mode, + intermediate_statistics& intermediate_stats, + rmm::cuda_stream_view stream) +{ + if (not single_write_mode) { + // persist the strings in the chunks into a string pool and update pointers + auto const num_chunks = static_cast(intermediate_stats.stripe_stat_chunks.size()); + // min offset and max offset + 1 for total size + rmm::device_uvector offsets((num_chunks * 2) + 1, stream); + + auto iter = cudf::detail::make_counting_transform_iterator( + 0, + string_length_functor{num_chunks, + intermediate_stats.stripe_stat_chunks.data(), + intermediate_stats.stripe_stat_merge.device_ptr()}); + thrust::exclusive_scan(rmm::exec_policy(stream), iter, iter + offsets.size(), offsets.begin()); + + // pull size back to host + auto const total_string_pool_size = offsets.element(num_chunks * 2, stream); + if (total_string_pool_size > 0) { + rmm::device_uvector string_pool(total_string_pool_size, stream); + + // offsets describes where in the string pool each string goes. 
Going with the simple + // approach for now, but it is possible something fancier with breaking up each thread into + // copying x bytes instead of a single string is the better method since we are dealing in + // min/max strings they almost certainly will not be uniform length. + copy_string_data<<>>( + string_pool.data(), + offsets.data(), + intermediate_stats.stripe_stat_chunks.data(), + intermediate_stats.stripe_stat_merge.device_ptr()); + string_pools.emplace_back(std::move(string_pool)); + } + } + + stripe_stat_chunks.emplace_back(std::move(intermediate_stats.stripe_stat_chunks)); + stripe_stat_merge.emplace_back(std::move(intermediate_stats.stripe_stat_merge)); + stats_dtypes = std::move(intermediate_stats.stats_dtypes); + col_types = std::move(intermediate_stats.col_types); + num_rows = num_table_rows; +} + +namespace { /** * @brief Gathers stripe information. * @@ -418,7 +549,7 @@ void init_dictionaries(orc_table_view& orc_table, [&](auto& col_idx) { auto& str_column = orc_table.column(col_idx); return cudf::detail::make_zeroed_device_uvector_async( - str_column.size(), stream); + str_column.size(), stream, rmm::mr::get_current_device_resource()); }); // Create views of the temporary buffers in device memory @@ -428,7 +559,8 @@ void init_dictionaries(orc_table_view& orc_table, dict_indices.begin(), dict_indices.end(), std::back_inserter(dict_indices_views), [](auto& di) { return device_span{di}; }); - auto d_dict_indices_views = cudf::detail::make_device_uvector_async(dict_indices_views, stream); + auto d_dict_indices_views = cudf::detail::make_device_uvector_async( + dict_indices_views, stream, rmm::mr::get_current_device_resource()); gpu::InitDictionaryIndices(orc_table.d_columns, *dict, @@ -441,12 +573,26 @@ void init_dictionaries(orc_table_view& orc_table, dict->device_to_host(stream, true); } -void writer::impl::build_dictionaries(orc_table_view& orc_table, - host_span stripe_bounds, - hostdevice_2dvector const& dict, - host_span> dict_index, - 
host_span dictionary_enabled, - hostdevice_2dvector& stripe_dict) +/** + * @brief Builds up per-stripe dictionaries for string columns. + * + * @param orc_table Non-owning view of a cuDF table w/ ORC-related info + * @param stripe_bounds List of stripe boundaries + * @param dict List of dictionary chunks [rowgroup][column] + * @param dict_index List of dictionary indices + * @param dictionary_enabled Whether dictionary encoding is enabled for a given column + * @param stripe_dict List of stripe dictionaries + * @param enable_dictionary Whether dictionary is enabled + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void build_dictionaries(orc_table_view& orc_table, + host_span stripe_bounds, + hostdevice_2dvector const& dict, + host_span> dict_index, + host_span dictionary_enabled, + hostdevice_2dvector& stripe_dict, + bool enable_dictionary, + rmm::cuda_stream_view stream) { const auto num_rowgroups = dict.size().first; @@ -470,7 +616,7 @@ void writer::impl::build_dictionaries(orc_table_view& orc_table, sd.leaf_column = dict[0][dict_idx].leaf_column; } - if (enable_dictionary_) { + if (enable_dictionary) { struct string_column_cost { size_t direct = 0; size_t dictionary = 0; @@ -554,9 +700,20 @@ auto comp_block_alignment(CompressionKind compression_kind) return 1u << nvcomp::compress_output_alignment_bits(to_nvcomp_compression_type(compression_kind)); } -orc_streams writer::impl::create_streams(host_span columns, - file_segmentation const& segmentation, - std::map const& decimal_column_sizes) +/** + * @brief Builds up per-column streams. 
+ * + * @param[in,out] columns List of columns + * @param[in] segmentation stripe and rowgroup ranges + * @param[in] decimal_column_sizes Sizes of encoded decimal columns + * @return List of stream descriptors + */ +orc_streams create_streams(host_span columns, + file_segmentation const& segmentation, + std::map const& decimal_column_sizes, + bool enable_dictionary, + CompressionKind compression_kind, + bool single_write_mode) { // 'column 0' row index stream std::vector streams{{ROW_INDEX, 0}}; // TODO: Separate index and data streams? @@ -599,7 +756,7 @@ orc_streams writer::impl::create_streams(host_span columns, auto add_stream = [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { - auto const max_alignment_padding = uncomp_block_alignment(compression_kind_) - 1; + auto const max_alignment_padding = uncomp_block_alignment(compression_kind) - 1; const auto base = column.index() * gpu::CI_NUM_STREAMS; ids[base + index_type] = streams.size(); streams.push_back(orc::Stream{ @@ -636,7 +793,7 @@ orc_streams writer::impl::create_streams(host_span columns, column.set_orc_encoding(DIRECT); break; case TypeKind::STRING: { - bool enable_dict = enable_dictionary_; + bool enable_dict = enable_dictionary; size_t dict_data_size = 0; size_t dict_strings = 0; size_t dict_lengths_div512 = 0; @@ -711,47 +868,6 @@ orc_streams writer::impl::create_streams(host_span columns, return {std::move(streams), std::move(ids), std::move(types)}; } -orc_streams::orc_stream_offsets orc_streams::compute_offsets( - host_span columns, size_t num_rowgroups) const -{ - std::vector strm_offsets(streams.size()); - size_t non_rle_data_size = 0; - size_t rle_data_size = 0; - for (size_t i = 0; i < streams.size(); ++i) { - const auto& stream = streams[i]; - - auto const is_rle_data = [&]() { - // First stream is an index stream, don't check types, etc. 
- if (!stream.column_index().has_value()) return true; - - auto const& column = columns[stream.column_index().value()]; - // Dictionary encoded string column - dictionary characters or - // directly encoded string - column characters - if (column.orc_kind() == TypeKind::STRING && - ((stream.kind == DICTIONARY_DATA && column.orc_encoding() == DICTIONARY_V2) || - (stream.kind == DATA && column.orc_encoding() == DIRECT_V2))) - return false; - // Decimal data - if (column.orc_kind() == TypeKind::DECIMAL && stream.kind == DATA) return false; - - // Everything else uses RLE - return true; - }(); - // non-RLE and RLE streams are separated in the buffer that stores encoded data - // The computed offsets do not take the streams of the other type into account - if (is_rle_data) { - strm_offsets[i] = rle_data_size; - rle_data_size += (stream.length + 7) & ~7; - } else { - strm_offsets[i] = non_rle_data_size; - non_rle_data_size += stream.length; - } - } - non_rle_data_size = (non_rle_data_size + 7) & ~7; - - return {std::move(strm_offsets), non_rle_data_size, rle_data_size}; -} - std::vector> calculate_aligned_rowgroup_bounds( orc_table_view const& orc_table, file_segmentation const& segmentation, @@ -772,7 +888,8 @@ std::vector> calculate_aligned_rowgroup_bounds( aligned_rgs.count() * sizeof(rowgroup_rows), cudaMemcpyDefault, stream.value())); - auto const d_stripes = cudf::detail::make_device_uvector_async(segmentation.stripes, stream); + auto const d_stripes = cudf::detail::make_device_uvector_async( + segmentation.stripes, stream, rmm::mr::get_current_device_resource()); // One thread per column, per stripe thrust::for_each_n( @@ -1091,11 +1208,23 @@ encoded_data encode_columns(orc_table_view const& orc_table, return {std::move(encoded_data), std::move(chunk_streams)}; } -std::vector writer::impl::gather_stripes( +/** + * @brief Returns stripe information after compacting columns' individual data + * chunks into contiguous data streams. 
+ * + * @param[in] num_index_streams Total number of index streams + * @param[in] segmentation stripe and rowgroup ranges + * @param[in,out] enc_streams List of encoder chunk streams [column][rowgroup] + * @param[in,out] strm_desc List of stream descriptors [stripe][data_stream] + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @return The stripes' information + */ +std::vector gather_stripes( size_t num_index_streams, file_segmentation const& segmentation, hostdevice_2dvector* enc_streams, - hostdevice_2dvector* strm_desc) + hostdevice_2dvector* strm_desc, + rmm::cuda_stream_view stream) { if (segmentation.num_stripes() == 0) { return {}; } std::vector stripes(segmentation.num_stripes()); @@ -1163,16 +1292,25 @@ hostdevice_vector allocate_and_encode_blobs( return blobs; } -writer::impl::intermediate_statistics writer::impl::gather_statistic_blobs( - statistics_freq const stats_freq, - orc_table_view const& orc_table, - file_segmentation const& segmentation) +/** + * @brief Returns column statistics in an intermediate format. 
+ * + * @param statistics_freq Frequency of statistics to be included in the output file + * @param orc_table Table information to be written + * @param segmentation stripe and rowgroup ranges + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The statistic information + */ +intermediate_statistics gather_statistic_blobs(statistics_freq const stats_freq, + orc_table_view const& orc_table, + file_segmentation const& segmentation, + rmm::cuda_stream_view stream) { auto const num_rowgroup_blobs = segmentation.rowgroups.count(); auto const num_stripe_blobs = segmentation.num_stripes() * orc_table.num_columns(); auto const are_statistics_enabled = stats_freq != statistics_freq::STATISTICS_NONE; if (not are_statistics_enabled or num_rowgroup_blobs + num_stripe_blobs == 0) { - return writer::impl::intermediate_statistics{stream}; + return intermediate_statistics{stream}; } hostdevice_vector stat_desc(orc_table.num_columns(), stream); @@ -1290,8 +1428,17 @@ writer::impl::intermediate_statistics writer::impl::gather_statistic_blobs( std::move(col_types)}; } -writer::impl::encoded_footer_statistics writer::impl::finish_statistic_blobs( - int num_stripes, writer::impl::persisted_statistics& per_chunk_stats) +/** + * @brief Returns column statistics encoded in ORC protobuf format stored in the footer. 
+ * + * @param num_stripes number of stripes in the data + * @param incoming_stats intermediate statistics returned from `gather_statistic_blobs` + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The encoded statistic blobs + */ +encoded_footer_statistics finish_statistic_blobs(int num_stripes, + persisted_statistics& per_chunk_stats, + rmm::cuda_stream_view stream) { auto stripe_size_iter = thrust::make_transform_iterator(per_chunk_stats.stripe_stat_merge.begin(), [](auto const& i) { return i.size(); }); @@ -1381,16 +1528,36 @@ writer::impl::encoded_footer_statistics writer::impl::finish_statistic_blobs( return {std::move(stripe_blobs), std::move(file_blobs)}; } -void writer::impl::write_index_stream(int32_t stripe_id, - int32_t stream_id, - host_span columns, - file_segmentation const& segmentation, - host_2dspan enc_streams, - host_2dspan strm_desc, - host_span comp_res, - std::vector const& rg_stats, - StripeInformation* stripe, - orc_streams* streams) +/** + * @brief Writes the specified column's row index stream. 
+ * + * @param[in] stripe_id Stripe's identifier + * @param[in] stream_id Stream identifier (column id + 1) + * @param[in] columns List of columns + * @param[in] segmentation stripe and rowgroup ranges + * @param[in] enc_streams List of encoder chunk streams [column][rowgroup] + * @param[in] strm_desc List of stream descriptors + * @param[in] comp_res Output status for compressed streams + * @param[in] rg_stats row group level statistics + * @param[in,out] stripe Stream's parent stripe + * @param[in,out] streams List of all streams + * @param[in] compression_kind The compression kind + * @param[in] compression_blocksize The block size used for compression + * @param[in] out_sink Sink for writing data + */ +void write_index_stream(int32_t stripe_id, + int32_t stream_id, + host_span columns, + file_segmentation const& segmentation, + host_2dspan enc_streams, + host_2dspan strm_desc, + host_span comp_res, + std::vector const& rg_stats, + StripeInformation* stripe, + orc_streams* streams, + CompressionKind compression_kind, + size_t compression_blocksize, + std::unique_ptr const& out_sink) { row_group_index_info present; row_group_index_info data; @@ -1402,7 +1569,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, row_group_index_info record; if (stream.ids[type] > 0) { record.pos = 0; - if (compression_kind_ != NONE) { + if (compression_kind != NONE) { auto const& ss = strm_desc[stripe_id][stream.ids[type] - (columns.size() + 1)]; record.blk_pos = ss.first_block; record.comp_pos = 0; @@ -1417,10 +1584,10 @@ void writer::impl::write_index_stream(int32_t stripe_id, if (record.pos >= 0) { record.pos += stream.lengths[type]; while ((record.pos >= 0) && (record.blk_pos >= 0) && - (static_cast(record.pos) >= compression_blocksize_) && + (static_cast(record.pos) >= compression_blocksize) && (record.comp_pos + block_header_size + comp_res[record.blk_pos].bytes_written < static_cast(record.comp_size))) { - record.pos -= compression_blocksize_; + record.pos -= 
compression_blocksize; record.comp_pos += block_header_size + comp_res[record.blk_pos].bytes_written; record.blk_pos += 1; } @@ -1442,7 +1609,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, } } - ProtobufWriter pbw((compression_kind_ != NONE) ? 3 : 0); + ProtobufWriter pbw((compression_kind != NONE) ? 3 : 0); // Add row index entries auto const& rowgroups_range = segmentation.stripes[stripe_id]; @@ -1467,22 +1634,39 @@ void writer::impl::write_index_stream(int32_t stripe_id, }); (*streams)[stream_id].length = pbw.size(); - if (compression_kind_ != NONE) { + if (compression_kind != NONE) { uint32_t uncomp_ix_len = (uint32_t)((*streams)[stream_id].length - 3) * 2 + 1; pbw.buffer()[0] = static_cast(uncomp_ix_len >> 0); pbw.buffer()[1] = static_cast(uncomp_ix_len >> 8); pbw.buffer()[2] = static_cast(uncomp_ix_len >> 16); } - out_sink_->host_write(pbw.data(), pbw.size()); + out_sink->host_write(pbw.data(), pbw.size()); stripe->indexLength += pbw.size(); } -std::future writer::impl::write_data_stream(gpu::StripeStream const& strm_desc, - gpu::encoder_chunk_streams const& enc_stream, - uint8_t const* compressed_data, - uint8_t* stream_out, - StripeInformation* stripe, - orc_streams* streams) +/** + * @brief Write the specified column's data streams + * + * @param[in] strm_desc Stream's descriptor + * @param[in] enc_stream Chunk's streams + * @param[in] compressed_data Compressed stream data + * @param[in,out] stream_out Temporary host output buffer + * @param[in,out] stripe Stream's parent stripe + * @param[in,out] streams List of all streams + * @param[in] compression_kind The compression kind + * @param[in] out_sink Sink for writing data + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @return An std::future that should be synchronized to ensure the writing is complete + */ +std::future write_data_stream(gpu::StripeStream const& strm_desc, + gpu::encoder_chunk_streams const& enc_stream, + uint8_t const* 
compressed_data, + uint8_t* stream_out, + StripeInformation* stripe, + orc_streams* streams, + CompressionKind compression_kind, + std::unique_ptr const& out_sink, + rmm::cuda_stream_view stream) { const auto length = strm_desc.stream_size; (*streams)[enc_stream.ids[strm_desc.stream_type]].length = length; @@ -1490,18 +1674,18 @@ std::future writer::impl::write_data_stream(gpu::StripeStream const& strm_ return std::async(std::launch::deferred, [] {}); } - const auto* stream_in = (compression_kind_ == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type] - : (compressed_data + strm_desc.bfr_offset); + const auto* stream_in = (compression_kind == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type] + : (compressed_data + strm_desc.bfr_offset); auto write_task = [&]() { - if (out_sink_->is_device_write_preferred(length)) { - return out_sink_->device_write_async(stream_in, length, stream); + if (out_sink->is_device_write_preferred(length)) { + return out_sink->device_write_async(stream_in, length, stream); } else { CUDF_CUDA_TRY( cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDefault, stream.value())); stream.synchronize(); - out_sink_->host_write(stream_out, length); + out_sink->host_write(stream_out, length); return std::async(std::launch::deferred, [] {}); } }(); @@ -1509,18 +1693,27 @@ std::future writer::impl::write_data_stream(gpu::StripeStream const& strm_ return write_task; } -void writer::impl::add_uncompressed_block_headers(std::vector& v) +/** + * @brief Insert 3-byte uncompressed block headers in a byte vector + * + * @param compression_kind The compression kind + * @param compression_blocksize The block size used for compression + * @param v The destitation byte vector to write, which must include initial 3-byte header + */ +void add_uncompressed_block_headers(CompressionKind compression_kind, + size_t compression_blocksize, + std::vector& v) { - if (compression_kind_ != NONE) { + if (compression_kind != NONE) { size_t uncomp_len = v.size() - 3, pos 
= 0, block_len; - while (uncomp_len > compression_blocksize_) { - block_len = compression_blocksize_ * 2 + 1; + while (uncomp_len > compression_blocksize) { + block_len = compression_blocksize * 2 + 1; v[pos + 0] = static_cast(block_len >> 0); v[pos + 1] = static_cast(block_len >> 8); v[pos + 2] = static_cast(block_len >> 16); - pos += 3 + compression_blocksize_; + pos += 3 + compression_blocksize; v.insert(v.begin() + pos, 3, 0); - uncomp_len -= compression_blocksize_; + uncomp_len -= compression_blocksize; } block_len = uncomp_len * 2 + 1; v[pos + 0] = static_cast(block_len >> 0); @@ -1529,58 +1722,6 @@ void writer::impl::add_uncompressed_block_headers(std::vector& v) } } -writer::impl::impl(std::unique_ptr sink, - orc_writer_options const& options, - SingleWriteMode mode, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _mr(mr), - stream(stream), - max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, - row_index_stride{options.get_row_index_stride()}, - compression_kind_(to_orc_compression(options.get_compression())), - compression_blocksize_(compression_block_size(compression_kind_)), - stats_freq_(options.get_statistics_freq()), - single_write_mode(mode == SingleWriteMode::YES), - kv_meta(options.get_key_value_metadata()), - out_sink_(std::move(sink)) -{ - if (options.get_metadata()) { - table_meta = std::make_unique(*options.get_metadata()); - } - init_state(); -} - -writer::impl::impl(std::unique_ptr sink, - chunked_orc_writer_options const& options, - SingleWriteMode mode, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _mr(mr), - stream(stream), - max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, - row_index_stride{options.get_row_index_stride()}, - compression_kind_(to_orc_compression(options.get_compression())), - compression_blocksize_(compression_block_size(compression_kind_)), - stats_freq_(options.get_statistics_freq()), - 
single_write_mode(mode == SingleWriteMode::YES), - kv_meta(options.get_key_value_metadata()), - out_sink_(std::move(sink)) -{ - if (options.get_metadata()) { - table_meta = std::make_unique(*options.get_metadata()); - } - init_state(); -} - -writer::impl::~impl() { close(); } - -void writer::impl::init_state() -{ - // Write file header - out_sink_->host_write(MAGIC, std::strlen(MAGIC)); -} - void pushdown_lists_null_mask(orc_column_view const& col, device_span d_columns, bitmask_type const* parent_pd_mask, @@ -1675,7 +1816,8 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, } // Attach null masks to device column views (async) - auto const d_mask_ptrs = cudf::detail::make_device_uvector_async(mask_ptrs, stream); + auto const d_mask_ptrs = cudf::detail::make_device_uvector_async( + mask_ptrs, stream, rmm::mr::get_current_device_resource()); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0ul), @@ -1765,7 +1907,8 @@ orc_table_view make_orc_table_view(table_view const& table, orc_columns.cbegin(), orc_columns.cend(), std::back_inserter(type_kinds), [](auto& orc_column) { return orc_column.orc_kind(); }); - auto const d_type_kinds = cudf::detail::make_device_uvector_async(type_kinds, stream); + auto const d_type_kinds = cudf::detail::make_device_uvector_async( + type_kinds, stream, rmm::mr::get_current_device_resource()); rmm::device_uvector d_orc_columns(orc_columns.size(), stream); using stack_value_type = thrust::pair>; @@ -1815,7 +1958,8 @@ orc_table_view make_orc_table_view(table_view const& table, return {std::move(orc_columns), std::move(d_orc_columns), str_col_indexes, - cudf::detail::make_device_uvector_sync(str_col_indexes, stream)}; + cudf::detail::make_device_uvector_sync( + str_col_indexes, stream, rmm::mr::get_current_device_resource())}; } hostdevice_2dvector calculate_rowgroup_bounds(orc_table_view const& orc_table, @@ -1983,7 +2127,7 @@ string_dictionaries allocate_dictionaries(orc_table_view const& 
orc_table, std::back_inserter(data), [&](auto& idx) { return cudf::detail::make_zeroed_device_uvector_async( - orc_table.columns[idx].size(), stream); + orc_table.columns[idx].size(), stream, rmm::mr::get_current_device_resource()); }); std::vector> index; std::transform(orc_table.string_column_indices.begin(), @@ -1991,7 +2135,7 @@ string_dictionaries allocate_dictionaries(orc_table_view const& orc_table, std::back_inserter(index), [&](auto& idx) { return cudf::detail::make_zeroed_device_uvector_async( - orc_table.columns[idx].size(), stream); + orc_table.columns[idx].size(), stream, rmm::mr::get_current_device_resource()); }); stream.synchronize(); @@ -2006,53 +2150,13 @@ string_dictionaries allocate_dictionaries(orc_table_view const& orc_table, return {std::move(data), std::move(index), - cudf::detail::make_device_uvector_sync(data_ptrs, stream), - cudf::detail::make_device_uvector_sync(index_ptrs, stream), + cudf::detail::make_device_uvector_sync( + data_ptrs, stream, rmm::mr::get_current_device_resource()), + cudf::detail::make_device_uvector_sync( + index_ptrs, stream, rmm::mr::get_current_device_resource()), std::move(is_dict_enabled)}; } -struct string_length_functor { - __device__ inline size_type operator()(int const i) const - { - // we translate from 0 -> num_chunks * 2 because each statistic has a min and max - // string and we need to calculate lengths for both. - if (i >= num_chunks * 2) return 0; - - // min strings are even values, max strings are odd values of i - auto const should_copy_min = i % 2 == 0; - // index of the chunk - auto const idx = i / 2; - auto& str_val = should_copy_min ? stripe_stat_chunks[idx].min_value.str_val - : stripe_stat_chunks[idx].max_value.str_val; - auto const str = stripe_stat_merge[idx].stats_dtype == dtype_string; - return str ? 
str_val.length : 0; - } - - int const num_chunks; - statistics_chunk const* stripe_stat_chunks; - statistics_merge_group const* stripe_stat_merge; -}; - -__global__ void copy_string_data(char* string_pool, - size_type* offsets, - statistics_chunk* chunks, - statistics_merge_group const* groups) -{ - auto const idx = blockIdx.x / 2; - if (groups[idx].stats_dtype == dtype_string) { - // min strings are even values, max strings are odd values of i - auto const should_copy_min = blockIdx.x % 2 == 0; - auto& str_val = should_copy_min ? chunks[idx].min_value.str_val : chunks[idx].max_value.str_val; - auto dst = &string_pool[offsets[blockIdx.x]]; - auto src = str_val.ptr; - - for (int i = threadIdx.x; i < str_val.length; i += blockDim.x) { - dst[i] = src[i]; - } - if (threadIdx.x == 0) { str_val.ptr = dst; } - } -} - size_t max_compression_output_size(CompressionKind compression_kind, uint32_t compression_blocksize) { if (compression_kind == NONE) return 0; @@ -2061,60 +2165,14 @@ size_t max_compression_output_size(CompressionKind compression_kind, uint32_t co compression_blocksize); } -void writer::impl::persisted_statistics::persist(int num_table_rows, - bool single_write_mode, - intermediate_statistics& intermediate_stats, - rmm::cuda_stream_view stream) -{ - if (not single_write_mode) { - // persist the strings in the chunks into a string pool and update pointers - auto const num_chunks = static_cast(intermediate_stats.stripe_stat_chunks.size()); - // min offset and max offset + 1 for total size - rmm::device_uvector offsets((num_chunks * 2) + 1, stream); - - auto iter = cudf::detail::make_counting_transform_iterator( - 0, - string_length_functor{num_chunks, - intermediate_stats.stripe_stat_chunks.data(), - intermediate_stats.stripe_stat_merge.device_ptr()}); - thrust::exclusive_scan(rmm::exec_policy(stream), iter, iter + offsets.size(), offsets.begin()); - - // pull size back to host - auto const total_string_pool_size = offsets.element(num_chunks * 2, stream); - if 
(total_string_pool_size > 0) { - rmm::device_uvector string_pool(total_string_pool_size, stream); - - // offsets describes where in the string pool each string goes. Going with the simple - // approach for now, but it is possible something fancier with breaking up each thread into - // copying x bytes instead of a single string is the better method since we are dealing in - // min/max strings they almost certainly will not be uniform length. - copy_string_data<<>>( - string_pool.data(), - offsets.data(), - intermediate_stats.stripe_stat_chunks.data(), - intermediate_stats.stripe_stat_merge.device_ptr()); - string_pools.emplace_back(std::move(string_pool)); - } - } - - stripe_stat_chunks.emplace_back(std::move(intermediate_stats.stripe_stat_chunks)); - stripe_stat_merge.emplace_back(std::move(intermediate_stats.stripe_stat_merge)); - stats_dtypes = std::move(intermediate_stats.stats_dtypes); - col_types = std::move(intermediate_stats.col_types); - num_rows = num_table_rows; -} - -void writer::impl::write(table_view const& table) +std::unique_ptr make_table_meta(table_view const& input) { - CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); - auto const num_rows = table.num_rows(); - - if (not table_meta) { table_meta = std::make_unique(table); } + auto table_meta = std::make_unique(input); // Fill unnamed columns' names in table_meta std::function add_default_name = [&](column_in_metadata& col_meta, std::string default_name) { - if (col_meta.get_name().empty()) col_meta.set_name(default_name); + if (col_meta.get_name().empty()) { col_meta.set_name(default_name); } for (size_type i = 0; i < col_meta.num_children(); ++i) { add_default_name(col_meta.child(i), std::to_string(i)); } @@ -2123,9 +2181,51 @@ void writer::impl::write(table_view const& table) add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); } - auto const d_table = table_device_view::create(table, stream); + return table_meta; +} + +/** + * @brief Perform 
the processing steps needed to convert the input table into the output ORC data + * for writing, such as compression and ORC encoding. + * + * @param input The input table + * @param table_meta The table metadata + * @param max_stripe_size Maximum size of stripes in the output file + * @param row_index_stride The row index stride + * @param enable_dictionary Whether dictionary is enabled + * @param compression_kind The compression kind + * @param compression_blocksize The block size used for compression + * @param stats_freq Column statistics granularity type for parquet/orc writers + * @param single_write_mode Flag to indicate if there is only a single table write + * @param out_sink Sink for writing data + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of the intermediate results containing the processed data + */ +std::tuple, + hostdevice_2dvector, + encoded_data, + file_segmentation, + std::vector, + orc_table_view, + rmm::device_buffer, + intermediate_statistics, + pinned_buffer> +convert_table_to_orc_data(table_view const& input, + table_input_metadata const& table_meta, + stripe_size_limits max_stripe_size, + size_type row_index_stride, + bool enable_dictionary, + CompressionKind compression_kind, + size_t compression_blocksize, + statistics_freq stats_freq, + bool single_write_mode, + data_sink const& out_sink, + rmm::cuda_stream_view stream) +{ + auto const input_tview = table_device_view::create(input, stream); - auto orc_table = make_orc_table_view(table, *d_table, *table_meta, stream); + auto orc_table = make_orc_table_view(input, *input_tview, table_meta, stream); auto const pd_masks = init_pushdown_null_masks(orc_table, stream); @@ -2145,7 +2245,7 @@ void writer::impl::write(table_view const& table) } // Decide stripe boundaries based on rowgroups and dict chunks - auto const segmentation = + auto segmentation = calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), 
max_stripe_size); // Build stripe-level dictionaries @@ -2157,15 +2257,22 @@ void writer::impl::write(table_view const& table) dict, dictionaries.index, dictionaries.dictionary_enabled, - stripe_dict); + stripe_dict, + enable_dictionary, + stream); } auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); - auto const uncompressed_block_align = uncomp_block_alignment(compression_kind_); - auto const compressed_block_align = comp_block_alignment(compression_kind_); - auto streams = - create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes)); + auto const uncompressed_block_align = uncomp_block_alignment(compression_kind); + auto const compressed_block_align = comp_block_alignment(compression_kind); + + auto streams = create_streams(orc_table.columns, + segmentation, + decimal_column_sizes(dec_chunk_sizes.rg_sizes), + enable_dictionary, + compression_kind, + single_write_mode); auto enc_data = encode_columns(orc_table, std::move(dictionaries), std::move(dec_chunk_sizes), @@ -2174,152 +2281,314 @@ void writer::impl::write(table_view const& table) uncompressed_block_align, stream); + auto const num_rows = input.num_rows(); + // Assemble individual disparate column chunks into contiguous data streams size_type const num_index_streams = (orc_table.num_columns() + 1); const auto num_data_streams = streams.size() - num_index_streams; hostdevice_2dvector strm_descs( segmentation.num_stripes(), num_data_streams, stream); - auto stripes = gather_stripes(num_index_streams, segmentation, &enc_data.streams, &strm_descs); - - if (num_rows > 0) { - // Allocate intermediate output stream buffer - size_t compressed_bfr_size = 0; - size_t num_compressed_blocks = 0; - - auto const max_compressed_block_size = - max_compression_output_size(compression_kind_, compression_blocksize_); - auto const padded_max_compressed_block_size = - util::round_up_unsafe(max_compressed_block_size, compressed_block_align); - auto const 
padded_block_header_size = - util::round_up_unsafe(block_header_size, compressed_block_align); - - auto stream_output = [&]() { - size_t max_stream_size = 0; - bool all_device_write = true; - - for (auto& ss : strm_descs.host_view().flat_view()) { - if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; } - size_t stream_size = ss.stream_size; - if (compression_kind_ != NONE) { - ss.first_block = num_compressed_blocks; - ss.bfr_offset = compressed_bfr_size; - - auto num_blocks = std::max( - (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); - stream_size += num_blocks * block_header_size; - num_compressed_blocks += num_blocks; - compressed_bfr_size += - (padded_block_header_size + padded_max_compressed_block_size) * num_blocks; - } - max_stream_size = std::max(max_stream_size, stream_size); - } + auto stripes = + gather_stripes(num_index_streams, segmentation, &enc_data.streams, &strm_descs, stream); + + if (num_rows == 0) { + return {std::move(streams), + hostdevice_vector{}, // comp_results + std::move(strm_descs), + std::move(enc_data), + std::move(segmentation), + std::move(stripes), + std::move(orc_table), + rmm::device_buffer{}, // compressed_data + intermediate_statistics{stream}, + pinned_buffer{nullptr, cudaFreeHost}}; + } - if (all_device_write) { - return pinned_buffer{nullptr, cudaFreeHost}; - } else { - return pinned_buffer{[](size_t size) { - uint8_t* ptr = nullptr; - CUDF_CUDA_TRY(cudaMallocHost(&ptr, size)); - return ptr; - }(max_stream_size), - cudaFreeHost}; + // Allocate intermediate output stream buffer + size_t compressed_bfr_size = 0; + size_t num_compressed_blocks = 0; + + auto const max_compressed_block_size = + max_compression_output_size(compression_kind, compression_blocksize); + auto const padded_max_compressed_block_size = + util::round_up_unsafe(max_compressed_block_size, compressed_block_align); + auto const padded_block_header_size = + util::round_up_unsafe(block_header_size, 
compressed_block_align); + + auto stream_output = [&]() { + size_t max_stream_size = 0; + bool all_device_write = true; + + for (auto& ss : strm_descs.host_view().flat_view()) { + if (!out_sink.is_device_write_preferred(ss.stream_size)) { all_device_write = false; } + size_t stream_size = ss.stream_size; + if (compression_kind != NONE) { + ss.first_block = num_compressed_blocks; + ss.bfr_offset = compressed_bfr_size; + + auto num_blocks = + std::max((stream_size + compression_blocksize - 1) / compression_blocksize, 1); + stream_size += num_blocks * block_header_size; + num_compressed_blocks += num_blocks; + compressed_bfr_size += + (padded_block_header_size + padded_max_compressed_block_size) * num_blocks; } - }(); + max_stream_size = std::max(max_stream_size, stream_size); + } - // Compress the data streams - rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector comp_results(num_compressed_blocks, stream); - thrust::fill(rmm::exec_policy(stream), - comp_results.d_begin(), - comp_results.d_end(), - compression_result{0, compression_status::FAILURE}); - if (compression_kind_ != NONE) { - strm_descs.host_to_device(stream); - gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), - num_compressed_blocks, - compression_kind_, - compression_blocksize_, - max_compressed_block_size, - compressed_block_align, - strm_descs, - enc_data.streams, - comp_results, - stream); - - // deallocate encoded data as it is not needed anymore - enc_data.data = rmm::device_uvector{0, stream}; - - strm_descs.device_to_host(stream); - comp_results.device_to_host(stream, true); + if (all_device_write) { + return pinned_buffer{nullptr, cudaFreeHost}; + } else { + return pinned_buffer{[](size_t size) { + uint8_t* ptr = nullptr; + CUDF_CUDA_TRY(cudaMallocHost(&ptr, size)); + return ptr; + }(max_stream_size), + cudaFreeHost}; } + }(); + + // Compress the data streams + rmm::device_buffer compressed_data(compressed_bfr_size, stream); + hostdevice_vector 
comp_results(num_compressed_blocks, stream); + thrust::fill(rmm::exec_policy(stream), + comp_results.d_begin(), + comp_results.d_end(), + compression_result{0, compression_status::FAILURE}); + if (compression_kind != NONE) { + strm_descs.host_to_device(stream); + gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), + num_compressed_blocks, + compression_kind, + compression_blocksize, + max_compressed_block_size, + compressed_block_align, + strm_descs, + enc_data.streams, + comp_results, + stream); + + // deallocate encoded data as it is not needed anymore + enc_data.data = rmm::device_uvector{0, stream}; + + strm_descs.device_to_host(stream); + comp_results.device_to_host(stream, true); + } - auto intermediate_stats = gather_statistic_blobs(stats_freq_, orc_table, segmentation); + auto intermediate_stats = gather_statistic_blobs(stats_freq, orc_table, segmentation, stream); + + return {std::move(streams), + std::move(comp_results), + std::move(strm_descs), + std::move(enc_data), + std::move(segmentation), + std::move(stripes), + std::move(orc_table), + std::move(compressed_data), + std::move(intermediate_stats), + std::move(stream_output)}; +} - if (intermediate_stats.stripe_stat_chunks.size() > 0) { - persisted_stripe_statistics.persist( - orc_table.num_rows(), single_write_mode, intermediate_stats, stream); +} // namespace + +writer::impl::impl(std::unique_ptr sink, + orc_writer_options const& options, + SingleWriteMode mode, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _mr(mr), + stream(stream), + max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, + row_index_stride{options.get_row_index_stride()}, + compression_kind_(to_orc_compression(options.get_compression())), + compression_blocksize_(compression_block_size(compression_kind_)), + stats_freq_(options.get_statistics_freq()), + single_write_mode(mode == SingleWriteMode::YES), + kv_meta(options.get_key_value_metadata()), + 
out_sink_(std::move(sink)) +{ + if (options.get_metadata()) { + table_meta = std::make_unique(*options.get_metadata()); + } + init_state(); +} + +writer::impl::impl(std::unique_ptr sink, + chunked_orc_writer_options const& options, + SingleWriteMode mode, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _mr(mr), + stream(stream), + max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, + row_index_stride{options.get_row_index_stride()}, + compression_kind_(to_orc_compression(options.get_compression())), + compression_blocksize_(compression_block_size(compression_kind_)), + stats_freq_(options.get_statistics_freq()), + single_write_mode(mode == SingleWriteMode::YES), + kv_meta(options.get_key_value_metadata()), + out_sink_(std::move(sink)) +{ + if (options.get_metadata()) { + table_meta = std::make_unique(*options.get_metadata()); + } + init_state(); +} + +writer::impl::~impl() { close(); } + +void writer::impl::init_state() +{ + // Write file header + out_sink_->host_write(MAGIC, std::strlen(MAGIC)); +} + +void writer::impl::write(table_view const& input) +{ + CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); + + if (not table_meta) { table_meta = make_table_meta(input); } + + // All kinds of memory allocation and data compressions/encoding are performed here. + // If any error occurs, such as out-of-memory exception, the internal state of the current writer + // is still intact. + // Note that `out_sink_` is intentionally passed by const reference to prevent accidentally + // writing anything to it. 
+ auto [streams, + comp_results, + strm_descs, + enc_data, + segmentation, + stripes, + orc_table, + compressed_data, + intermediate_stats, + stream_output] = [&] { + try { + return convert_table_to_orc_data(input, + *table_meta, + max_stripe_size, + row_index_stride, + enable_dictionary_, + compression_kind_, + compression_blocksize_, + stats_freq_, + single_write_mode, + *out_sink_, + stream); + } catch (...) { // catch any exception type + CUDF_LOG_ERROR( + "ORC writer encountered exception during processing. " + "No data has been written to the sink."); + throw; // this throws the same exception } + }(); - // Write stripes - std::vector> write_tasks; - for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { - auto& stripe = stripes[stripe_id]; - - stripe.offset = out_sink_->bytes_written(); - - // Column (skippable) index streams appear at the start of the stripe - for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) { - write_index_stream(stripe_id, - stream_id, - orc_table.columns, - segmentation, - enc_data.streams, - strm_descs, - comp_results, - intermediate_stats.rowgroup_blobs, - &stripe, - &streams); - } + // Compression/encoding were all successful. Now write the intermediate results. + write_orc_data_to_sink(streams, + comp_results, + strm_descs, + enc_data, + segmentation, + stripes, + orc_table, + compressed_data, + intermediate_stats, + stream_output.get()); + + // Update data into the footer. This needs to be called even when num_rows==0. 
+ add_table_to_footer_data(orc_table, stripes); +} - // Column data consisting one or more separate streams - for (auto const& strm_desc : strm_descs[stripe_id]) { - write_tasks.push_back(write_data_stream( - strm_desc, - enc_data.streams[strm_desc.column_id][segmentation.stripes[stripe_id].first], - static_cast(compressed_data.data()), - stream_output.get(), - &stripe, - &streams)); - } +void writer::impl::write_orc_data_to_sink(orc_streams& streams, + hostdevice_vector const& comp_results, + hostdevice_2dvector const& strm_descs, + encoded_data const& enc_data, + file_segmentation const& segmentation, + std::vector& stripes, + orc_table_view const& orc_table, + rmm::device_buffer const& compressed_data, + intermediate_statistics& intermediate_stats, + uint8_t* stream_output) +{ + if (orc_table.num_rows() == 0) { return; } - // Write stripefooter consisting of stream information - StripeFooter sf; - sf.streams = streams; - sf.columns.resize(orc_table.num_columns() + 1); - sf.columns[0].kind = DIRECT; - for (size_t i = 1; i < sf.columns.size(); ++i) { - sf.columns[i].kind = orc_table.column(i - 1).orc_encoding(); - sf.columns[i].dictionarySize = - (sf.columns[i].kind == DICTIONARY_V2) - ? orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings - : 0; - if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } - } - ProtobufWriter pbw((compression_kind_ != NONE) ? 
3 : 0); - pbw.write(sf); - stripe.footerLength = pbw.size(); - if (compression_kind_ != NONE) { - uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; - pbw.buffer()[0] = static_cast(uncomp_sf_len >> 0); - pbw.buffer()[1] = static_cast(uncomp_sf_len >> 8); - pbw.buffer()[2] = static_cast(uncomp_sf_len >> 16); - } - out_sink_->host_write(pbw.data(), pbw.size()); + if (intermediate_stats.stripe_stat_chunks.size() > 0) { + persisted_stripe_statistics.persist( + orc_table.num_rows(), single_write_mode, intermediate_stats, stream); + } + + // Write stripes + std::vector> write_tasks; + for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { + auto& stripe = stripes[stripe_id]; + + stripe.offset = out_sink_->bytes_written(); + + // Column (skippable) index streams appear at the start of the stripe + size_type const num_index_streams = (orc_table.num_columns() + 1); + for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) { + write_index_stream(stripe_id, + stream_id, + orc_table.columns, + segmentation, + enc_data.streams, + strm_descs, + comp_results, + intermediate_stats.rowgroup_blobs, + &stripe, + &streams, + compression_kind_, + compression_blocksize_, + out_sink_); } - for (auto const& task : write_tasks) { - task.wait(); + + // Column data consisting one or more separate streams + for (auto const& strm_desc : strm_descs[stripe_id]) { + write_tasks.push_back(write_data_stream( + strm_desc, + enc_data.streams[strm_desc.column_id][segmentation.stripes[stripe_id].first], + static_cast(compressed_data.data()), + stream_output, + &stripe, + &streams, + compression_kind_, + out_sink_, + stream)); } + + // Write stripefooter consisting of stream information + StripeFooter sf; + sf.streams = streams; + sf.columns.resize(orc_table.num_columns() + 1); + sf.columns[0].kind = DIRECT; + for (size_t i = 1; i < sf.columns.size(); ++i) { + sf.columns[i].kind = orc_table.column(i - 1).orc_encoding(); + sf.columns[i].dictionarySize = + 
(sf.columns[i].kind == DICTIONARY_V2) + ? orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings + : 0; + if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } + } + ProtobufWriter pbw((compression_kind_ != NONE) ? 3 : 0); + pbw.write(sf); + stripe.footerLength = pbw.size(); + if (compression_kind_ != NONE) { + uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; + pbw.buffer()[0] = static_cast(uncomp_sf_len >> 0); + pbw.buffer()[1] = static_cast(uncomp_sf_len >> 8); + pbw.buffer()[2] = static_cast(uncomp_sf_len >> 16); + } + out_sink_->host_write(pbw.data(), pbw.size()); + } + for (auto const& task : write_tasks) { + task.wait(); } +} + +void writer::impl::add_table_to_footer_data(orc_table_view const& orc_table, + std::vector& stripes) +{ if (ff.headerLength == 0) { // First call ff.headerLength = std::strlen(MAGIC); @@ -2365,7 +2634,7 @@ void writer::impl::write(table_view const& table) ff.stripes.insert(ff.stripes.end(), std::make_move_iterator(stripes.begin()), std::make_move_iterator(stripes.end())); - ff.numberOfRows += num_rows; + ff.numberOfRows += orc_table.num_rows(); } void writer::impl::close() @@ -2374,7 +2643,8 @@ void writer::impl::close() closed = true; PostScript ps; - auto const statistics = finish_statistic_blobs(ff.stripes.size(), persisted_stripe_statistics); + auto const statistics = + finish_statistic_blobs(ff.stripes.size(), persisted_stripe_statistics, stream); // File-level statistics if (not statistics.file_level.empty()) { @@ -2418,7 +2688,7 @@ void writer::impl::close() if (md.stripeStats.size() != 0) { ProtobufWriter pbw((compression_kind_ != NONE) ? 
3 : 0); pbw.write(md); - add_uncompressed_block_headers(pbw.buffer()); + add_uncompressed_block_headers(compression_kind_, compression_blocksize_, pbw.buffer()); ps.metadataLength = pbw.size(); out_sink_->host_write(pbw.data(), pbw.size()); } else { @@ -2426,7 +2696,7 @@ void writer::impl::close() } ProtobufWriter pbw((compression_kind_ != NONE) ? 3 : 0); pbw.write(ff); - add_uncompressed_block_headers(pbw.buffer()); + add_uncompressed_block_headers(compression_kind_, compression_blocksize_, pbw.buffer()); // Write postscript metadata ps.footerLength = pbw.size(); diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 691fba6bac2..27d74e45b46 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -176,6 +176,72 @@ struct stripe_size_limits { size_type rows; }; +/** + * @brief Statistics data stored between calls to write for chunked writes + * + */ +struct intermediate_statistics { + explicit intermediate_statistics(rmm::cuda_stream_view stream) : stripe_stat_chunks(0, stream) {} + + intermediate_statistics(std::vector rb, + rmm::device_uvector sc, + hostdevice_vector smg, + std::vector sdt, + std::vector sct) + : rowgroup_blobs(std::move(rb)), + stripe_stat_chunks(std::move(sc)), + stripe_stat_merge(std::move(smg)), + stats_dtypes(std::move(sdt)), + col_types(std::move(sct)) + { + } + + // blobs for the rowgroups. Not persisted + std::vector rowgroup_blobs; + + rmm::device_uvector stripe_stat_chunks; + hostdevice_vector stripe_stat_merge; + std::vector stats_dtypes; + std::vector col_types; +}; + +/** + * @brief used for chunked writes to persist data between calls to write. 
+ * + */ +struct persisted_statistics { + void clear() + { + stripe_stat_chunks.clear(); + stripe_stat_merge.clear(); + string_pools.clear(); + stats_dtypes.clear(); + col_types.clear(); + num_rows = 0; + } + + void persist(int num_table_rows, + bool single_write_mode, + intermediate_statistics& intermediate_stats, + rmm::cuda_stream_view stream); + + std::vector> stripe_stat_chunks; + std::vector> stripe_stat_merge; + std::vector> string_pools; + std::vector stats_dtypes; + std::vector col_types; + int num_rows = 0; +}; + +/** + * @brief Protobuf encoded statistics created at file close + * + */ +struct encoded_footer_statistics { + std::vector stripe_level; + std::vector file_level; +}; + /** * @brief Implementation for ORC writer */ @@ -227,7 +293,7 @@ class writer::impl { /** * @brief Writes a single subtable as part of a larger ORC file/table write. * - * @param[in] table The table information to be written + * @param table The table information to be written */ void write(table_view const& table); @@ -238,186 +304,41 @@ class writer::impl { private: /** - * @brief Builds up per-stripe dictionaries for string columns. - * - * @param orc_table Non-owning view of a cuDF table w/ ORC-related info - * @param stripe_bounds List of stripe boundaries - * @param dict List of dictionary chunks [rowgroup][column] - * @param dict_index List of dictionary indices - * @param dictionary_enabled Whether dictionary encoding is enabled for a given column - * @param stripe_dict List of stripe dictionaries - */ - void build_dictionaries(orc_table_view& orc_table, - host_span stripe_bounds, - hostdevice_2dvector const& dict, - host_span> dict_index, - host_span dictionary_enabled, - hostdevice_2dvector& stripe_dict); - - /** - * @brief Builds up per-column streams. 
- * - * @param[in,out] columns List of columns - * @param[in] segmentation stripe and rowgroup ranges - * @param[in] decimal_column_sizes Sizes of encoded decimal columns - * @return List of stream descriptors - */ - orc_streams create_streams(host_span columns, - file_segmentation const& segmentation, - std::map const& decimal_column_sizes); - - /** - * @brief Returns stripe information after compacting columns' individual data - * chunks into contiguous data streams. - * - * @param[in] num_index_streams Total number of index streams - * @param[in] segmentation stripe and rowgroup ranges - * @param[in,out] enc_streams List of encoder chunk streams [column][rowgroup] - * @param[in,out] strm_desc List of stream descriptors [stripe][data_stream] + * @brief Write the intermediate ORC data into the data sink. * - * @return The stripes' information - */ - std::vector gather_stripes( - size_t num_index_streams, - file_segmentation const& segmentation, - hostdevice_2dvector* enc_streams, - hostdevice_2dvector* strm_desc); - - /** - * @brief Statistics data stored between calls to write for chunked writes - * - */ - struct intermediate_statistics { - explicit intermediate_statistics(rmm::cuda_stream_view stream) - : stripe_stat_chunks(0, stream){}; - intermediate_statistics(std::vector rb, - rmm::device_uvector sc, - hostdevice_vector smg, - std::vector sdt, - std::vector sct) - : rowgroup_blobs(std::move(rb)), - stripe_stat_chunks(std::move(sc)), - stripe_stat_merge(std::move(smg)), - stats_dtypes(std::move(sdt)), - col_types(std::move(sct)){}; - - // blobs for the rowgroups. Not persisted - std::vector rowgroup_blobs; - - rmm::device_uvector stripe_stat_chunks; - hostdevice_vector stripe_stat_merge; - std::vector stats_dtypes; - std::vector col_types; - }; - - /** - * @brief used for chunked writes to persist data between calls to write. 
- * - */ - struct persisted_statistics { - void clear() - { - stripe_stat_chunks.clear(); - stripe_stat_merge.clear(); - string_pools.clear(); - stats_dtypes.clear(); - col_types.clear(); - num_rows = 0; - } - - void persist(int num_table_rows, - bool single_write_mode, - intermediate_statistics& intermediate_stats, - rmm::cuda_stream_view stream); - - std::vector> stripe_stat_chunks; - std::vector> stripe_stat_merge; - std::vector> string_pools; - std::vector stats_dtypes; - std::vector col_types; - int num_rows = 0; - }; - - /** - * @brief Protobuf encoded statistics created at file close - * - */ - struct encoded_footer_statistics { - std::vector stripe_level; - std::vector file_level; - }; - - /** - * @brief Returns column statistics in an intermediate format. - * - * @param statistics_freq Frequency of statistics to be included in the output file - * @param orc_table Table information to be written - * @param segmentation stripe and rowgroup ranges - * @return The statistic information - */ - intermediate_statistics gather_statistic_blobs(statistics_freq const statistics_freq, - orc_table_view const& orc_table, - file_segmentation const& segmentation); - - /** - * @brief Returns column statistics encoded in ORC protobuf format stored in the footer. - * - * @param num_stripes number of stripes in the data - * @param incoming_stats intermediate statistics returned from `gather_statistic_blobs` - * @return The encoded statistic blobs - */ - encoded_footer_statistics finish_statistic_blobs( - int num_stripes, writer::impl::persisted_statistics& incoming_stats); - - /** - * @brief Writes the specified column's row index stream. 
- * - * @param[in] stripe_id Stripe's identifier - * @param[in] stream_id Stream identifier (column id + 1) - * @param[in] columns List of columns - * @param[in] segmentation stripe and rowgroup ranges - * @param[in] enc_streams List of encoder chunk streams [column][rowgroup] - * @param[in] strm_desc List of stream descriptors - * @param[in] comp_out Output status for compressed streams - * @param[in] rg_stats row group level statistics - * @param[in,out] stripe Stream's parent stripe - * @param[in,out] streams List of all streams - */ - void write_index_stream(int32_t stripe_id, - int32_t stream_id, - host_span columns, - file_segmentation const& segmentation, - host_2dspan enc_streams, - host_2dspan strm_desc, - host_span comp_out, - std::vector const& rg_stats, - StripeInformation* stripe, - orc_streams* streams); - - /** - * @brief Write the specified column's data streams + * The intermediate data is generated from processing (compressing/encoding) an cuDF input table + * by `process_for_write` called in the `write()` function. 
* - * @param[in] strm_desc Stream's descriptor - * @param[in] enc_stream Chunk's streams - * @param[in] compressed_data Compressed stream data - * @param[in,out] stream_out Temporary host output buffer - * @param[in,out] stripe Stream's parent stripe - * @param[in,out] streams List of all streams - * @return An std::future that should be synchronized to ensure the writing is complete + * @param streams List of stream descriptors + * @param comp_results Status of data compression + * @param strm_descs List of stream descriptors + * @param enc_data ORC per-chunk streams of encoded data + * @param segmentation Description of how the ORC file is segmented into stripes and rowgroups + * @param stripes List of stripe description + * @param orc_table Non-owning view of a cuDF table that includes ORC-related information + * @param compressed_data Compressed stream data + * @param intermediate_stats Statistics data stored between calls to write + * @param stream_output Temporary host output buffer */ - std::future write_data_stream(gpu::StripeStream const& strm_desc, - gpu::encoder_chunk_streams const& enc_stream, - uint8_t const* compressed_data, - uint8_t* stream_out, - StripeInformation* stripe, - orc_streams* streams); + void write_orc_data_to_sink(orc_streams& streams, + hostdevice_vector const& comp_results, + hostdevice_2dvector const& strm_descs, + encoded_data const& enc_data, + file_segmentation const& segmentation, + std::vector& stripes, + orc_table_view const& orc_table, + rmm::device_buffer const& compressed_data, + intermediate_statistics& intermediate_stats, + uint8_t* stream_output); /** - * @brief Insert 3-byte uncompressed block headers in a byte vector + * @brief Add the processed table data into the internal file footer. 
* - * @param byte_vector Raw data (must include initial 3-byte header) + * @param orc_table Non-owning view of a cuDF table that includes ORC-related information + * @param stripes List of stripe description */ - void add_uncompressed_block_headers(std::vector& byte_vector); + void add_table_to_footer_data(orc_table_view const& orc_table, + std::vector& stripes); private: rmm::mr::device_memory_resource* _mr = nullptr; diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 6b5d4ba3640..e5c2b7aa842 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -468,10 +468,12 @@ void decode_page_headers(hostdevice_vector& chunks, host_span const> comp_in_view{comp_in.data() + start_pos, codec.num_pages}; - auto const d_comp_in = cudf::detail::make_device_uvector_async(comp_in_view, stream); + auto const d_comp_in = cudf::detail::make_device_uvector_async( + comp_in_view, stream, rmm::mr::get_current_device_resource()); host_span const> comp_out_view(comp_out.data() + start_pos, codec.num_pages); - auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, stream); + auto const d_comp_out = cudf::detail::make_device_uvector_async( + comp_out_view, stream, rmm::mr::get_current_device_resource()); device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { @@ -523,8 +525,10 @@ void decode_page_headers(hostdevice_vector& chunks, // now copy the uncompressed V2 def and rep level data if (not copy_in.empty()) { - auto const d_copy_in = cudf::detail::make_device_uvector_async(copy_in, stream); - auto const d_copy_out = cudf::detail::make_device_uvector_async(copy_out, stream); + auto const d_copy_in = cudf::detail::make_device_uvector_async( + copy_in, stream, rmm::mr::get_current_device_resource()); + auto const d_copy_out = cudf::detail::make_device_uvector_async( + copy_out, stream, 
rmm::mr::get_current_device_resource()); gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream); stream.synchronize(); @@ -1489,8 +1493,8 @@ void reader::impl::preprocess_pages(size_t skip_rows, // Build index for string dictionaries since they can't be indexed // directly due to variable-sized elements _chunk_itm_data.str_dict_index = - cudf::detail::make_zeroed_device_uvector_async(total_str_dict_indexes, - _stream); + cudf::detail::make_zeroed_device_uvector_async( + total_str_dict_indexes, _stream, rmm::mr::get_current_device_resource()); // Update chunks with pointers to string dict indices for (size_t c = 0, page_count = 0, str_ofs = 0; c < chunks.size(); c++) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 5f407b5e774..e6e14908f36 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -853,7 +853,8 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, _nullability = std::vector(r_nullability.crbegin(), r_nullability.crend()); // TODO(cp): Explore doing this for all columns in a single go outside this ctor. Maybe using // hostdevice_vector. Currently this involves a cudaMemcpyAsync for each column. 
- _d_nullability = cudf::detail::make_device_uvector_async(_nullability, stream); + _d_nullability = cudf::detail::make_device_uvector_async( + _nullability, stream, rmm::mr::get_current_device_resource()); _is_list = (_max_rep_level > 0); @@ -928,7 +929,8 @@ void writer::impl::init_row_group_fragments( device_span part_frag_offset, uint32_t fragment_size) { - auto d_partitions = cudf::detail::make_device_uvector_async(partitions, stream); + auto d_partitions = cudf::detail::make_device_uvector_async( + partitions, stream, rmm::mr::get_current_device_resource()); gpu::InitRowGroupFragments(frag, col_desc, d_partitions, part_frag_offset, fragment_size, stream); frag.device_to_host(stream, true); } @@ -936,7 +938,8 @@ void writer::impl::init_row_group_fragments( void writer::impl::calculate_page_fragments(device_span frag, host_span frag_sizes) { - auto d_frag_sz = cudf::detail::make_device_uvector_async(frag_sizes, stream); + auto d_frag_sz = cudf::detail::make_device_uvector_async( + frag_sizes, stream, rmm::mr::get_current_device_resource()); gpu::CalculatePageFragments(frag, d_frag_sz, stream); } @@ -1507,7 +1510,8 @@ void writer::impl::write(table_view const& table, std::vector co num_frag_in_part.begin(), num_frag_in_part.end(), std::back_inserter(part_frag_offset), 0); part_frag_offset.push_back(part_frag_offset.back() + num_frag_in_part.back()); - auto d_part_frag_offset = cudf::detail::make_device_uvector_async(part_frag_offset, stream); + auto d_part_frag_offset = cudf::detail::make_device_uvector_async( + part_frag_offset, stream, rmm::mr::get_current_device_resource()); cudf::detail::hostdevice_2dvector row_group_fragments( num_columns, num_fragments, stream); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index a0ba3e3ee35..afa260e215a 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -379,9 +379,11 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source // must 
be at least 32 when using warp-reduce on partials // must be at least 1 more than max possible concurrent tiles // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s - auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); - auto tile_multistates = scan_tile_state(num_tile_states, stream); - auto tile_offsets = scan_tile_state(num_tile_states, stream); + auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); + auto tile_multistates = + scan_tile_state(num_tile_states, stream, rmm::mr::get_current_device_resource()); + auto tile_offsets = + scan_tile_state(num_tile_states, stream, rmm::mr::get_current_device_resource()); multibyte_split_init_kernel<< #include +#include + namespace cudf { namespace io { namespace detail { @@ -43,7 +45,8 @@ void column_buffer::create(size_type _size, // make_zeroed_device_uvector_async here and instead let it use the // default rmm memory resource. _strings = std::make_unique>( - cudf::detail::make_zeroed_device_uvector_async(size, stream)); + cudf::detail::make_zeroed_device_uvector_async( + size, stream, rmm::mr::get_current_device_resource())); break; // list columns store a buffer of int32's as offsets to represent diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index a03789464cc..5c5cbd1c01d 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -126,7 +126,8 @@ cudf::size_type find_all_from_set(device_span data, cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions)); const int grid_size = divCeil(data.size(), (size_t)block_size); - auto d_count = cudf::detail::make_zeroed_device_uvector_async(1, stream); + auto d_count = cudf::detail::make_zeroed_device_uvector_async( + 1, stream, rmm::mr::get_current_device_resource()); for (char key : keys) { count_and_set_positions<<>>( data.data(), data.size(), result_offset, key, d_count.data(), positions); @@ -143,7 +144,8 @@ cudf::size_type find_all_from_set(host_span data, rmm::cuda_stream_view stream) { rmm::device_buffer d_chunk(std::min(max_chunk_bytes, data.size()), stream); - auto d_count = cudf::detail::make_zeroed_device_uvector_async(1, stream); + auto d_count = cudf::detail::make_zeroed_device_uvector_async( + 1, stream, rmm::mr::get_current_device_resource()); int block_size = 0; // suggested thread count to use int min_grid_size = 0; // minimum block count required diff --git a/cpp/src/io/utilities/trie.cu b/cpp/src/io/utilities/trie.cu index bf03d6a6a89..e2ace7258f7 100644 --- a/cpp/src/io/utilities/trie.cu +++ b/cpp/src/io/utilities/trie.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -101,7 +101,8 @@ rmm::device_uvector create_serialized_trie(const std::vector> left_semi_anti_join( // Previously, the gather map was generated directly without this array but by calling to // `map.contains` inside the `thrust::copy_if` kernel. However, that led to increasing register // usage and reducing performance, as reported here: https://github.com/rapidsai/cudf/pull/10511. 
- auto const flagged = - cudf::detail::contains(right_keys, left_keys, compare_nulls, nan_equality::ALL_EQUAL, stream); + auto const flagged = cudf::detail::contains(right_keys, + left_keys, + compare_nulls, + nan_equality::ALL_EQUAL, + stream, + rmm::mr::get_current_device_resource()); auto const left_num_rows = left_keys.num_rows(); auto gather_map = diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index 8b006548391..b890a0c82a2 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -246,7 +246,8 @@ std::unique_ptr concatenate_rows(table_view const& input, auto const row_index = i % num_rows; return row_null_counts[row_index] != num_columns; }, - stream); + stream, + rmm::mr::get_current_device_resource()); } // NULLIFY_OUTPUT_ROW. 
Output row is nullfied if any input row is null return cudf::detail::valid_if( @@ -257,7 +258,8 @@ std::unique_ptr concatenate_rows(table_view const& input, auto const row_index = i % num_rows; return row_null_counts[row_index] == 0; }, - stream); + stream, + rmm::mr::get_current_device_resource()); }(); concat->set_null_mask(std::move(null_mask), null_count); } diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index c96a21df905..5136cc8cd37 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -266,7 +266,8 @@ dremel_data get_encoding(column_view h_col, max_vals_size += column_ends[l] - column_offsets[l]; } - auto d_nullability = cudf::detail::make_device_uvector_async(nullability, stream); + auto d_nullability = cudf::detail::make_device_uvector_async( + nullability, stream, rmm::mr::get_current_device_resource()); rmm::device_uvector rep_level(max_vals_size, stream); rmm::device_uvector def_level(max_vals_size, stream); diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index bff63871e29..5d4a20d1cb8 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -101,8 +101,10 @@ std::unique_ptr make_index_child(size_type index, */ std::unique_ptr make_index_offsets(size_type num_lists, rmm::cuda_stream_view stream) { - return cudf::detail::sequence( - num_lists + 1, cudf::scalar_type_t(0, true, stream), stream); + return cudf::detail::sequence(num_lists + 1, + cudf::scalar_type_t(0, true, stream), + stream, + rmm::mr::get_current_device_resource()); } } // namespace diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index 8df99153d74..c05ef2fd644 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -83,8 +83,8 @@ std::unique_ptr have_overlap(lists_column_view const& lhs, auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}}; // Check existence for each row of the rhs_table in lhs_table. 
- auto const contained = - cudf::detail::contains(lhs_table, rhs_table, nulls_equal, nans_equal, stream); + auto const contained = cudf::detail::contains( + lhs_table, rhs_table, nulls_equal, nans_equal, stream, rmm::mr::get_current_device_resource()); auto const num_rows = lhs.size(); @@ -151,8 +151,8 @@ std::unique_ptr intersect_distinct(lists_column_view const& lhs, auto const lhs_table = table_view{{lhs_labels->view(), lhs_child}}; auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}}; - auto const contained = - cudf::detail::contains(lhs_table, rhs_table, nulls_equal, nans_equal, stream); + auto const contained = cudf::detail::contains( + lhs_table, rhs_table, nulls_equal, nans_equal, stream, rmm::mr::get_current_device_resource()); auto const intersect_table = cudf::detail::copy_if( rhs_table, @@ -195,8 +195,11 @@ std::unique_ptr union_distinct(lists_column_view const& lhs, // Algorithm: `return distinct(concatenate_rows(lhs, rhs))`. - auto const union_col = lists::detail::concatenate_rows( - table_view{{lhs.parent(), rhs.parent()}}, concatenate_null_policy::NULLIFY_OUTPUT_ROW, stream); + auto const union_col = + lists::detail::concatenate_rows(table_view{{lhs.parent(), rhs.parent()}}, + concatenate_null_policy::NULLIFY_OUTPUT_ROW, + stream, + rmm::mr::get_current_device_resource()); return cudf::lists::detail::distinct( lists_column_view{union_col->view()}, nulls_equal, nans_equal, stream, mr); @@ -228,8 +231,8 @@ std::unique_ptr difference_distinct(lists_column_view const& lhs, auto const lhs_table = table_view{{lhs_labels->view(), lhs_child}}; auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}}; - auto const contained = - cudf::detail::contains(rhs_table, lhs_table, nulls_equal, nans_equal, stream); + auto const contained = cudf::detail::contains( + rhs_table, lhs_table, nulls_equal, nans_equal, stream, rmm::mr::get_current_device_resource()); auto const difference_table = cudf::detail::copy_if( lhs_table, diff --git 
a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index ec0cc5af44d..83ee6793efb 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -187,10 +187,12 @@ index_vector generate_merged_indices(table_view const& left_table, auto lhs_device_view = table_device_view::create(left_table, stream); auto rhs_device_view = table_device_view::create(right_table, stream); - auto d_column_order = cudf::detail::make_device_uvector_async(column_order, stream); + auto d_column_order = cudf::detail::make_device_uvector_async( + column_order, stream, rmm::mr::get_current_device_resource()); if (nullable) { - auto d_null_precedence = cudf::detail::make_device_uvector_async(null_precedence, stream); + auto d_null_precedence = cudf::detail::make_device_uvector_async( + null_precedence, stream, rmm::mr::get_current_device_resource()); auto ineq_op = detail::row_lexicographic_tagged_comparator( *lhs_device_view, *rhs_device_view, d_column_order.data(), d_null_precedence.data()); diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 54dffc85aca..13f46195392 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -493,11 +493,11 @@ std::pair, std::vector> hash_partition_table( rmm::device_uvector(grid_size * num_partitions, stream); // Holds the total number of rows in each partition - auto global_partition_sizes = - cudf::detail::make_zeroed_device_uvector_async(num_partitions, stream); + auto global_partition_sizes = cudf::detail::make_zeroed_device_uvector_async( + num_partitions, stream, rmm::mr::get_current_device_resource()); - auto row_partition_offset = - cudf::detail::make_zeroed_device_uvector_async(num_rows, stream); + auto row_partition_offset = cudf::detail::make_zeroed_device_uvector_async( + num_rows, stream, rmm::mr::get_current_device_resource()); auto const row_hasher = experimental::row::hash::row_hasher(table_to_hash, stream); auto const hasher = diff --git 
a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 785aa839956..4a9c2e3a902 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -85,7 +85,8 @@ struct quantile_functor { auto d_input = column_device_view::create(input, stream); auto d_output = mutable_column_device_view::create(output->mutable_view(), stream); - auto q_device = cudf::detail::make_device_uvector_sync(q, stream); + auto q_device = + cudf::detail::make_device_uvector_sync(q, stream, rmm::mr::get_current_device_resource()); if (!cudf::is_dictionary(input.type())) { auto sorted_data = diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index e71508bab09..c6760e77403 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,7 +49,8 @@ std::unique_ptr
quantiles(table_view const& input, return detail::select_quantile(selector, size, q, interp); }; - auto const q_device = cudf::detail::make_device_uvector_async(q, stream); + auto const q_device = + cudf::detail::make_device_uvector_async(q, stream, rmm::mr::get_current_device_resource()); auto quantile_idx_iter = thrust::make_transform_iterator(q_device.begin(), quantile_idx_lookup); diff --git a/cpp/src/reductions/struct_minmax_util.cuh b/cpp/src/reductions/struct_minmax_util.cuh index 796d10a3477..b2106066ff2 100644 --- a/cpp/src/reductions/struct_minmax_util.cuh +++ b/cpp/src/reductions/struct_minmax_util.cuh @@ -118,7 +118,8 @@ class comparison_binop_generator { // level structs column (which is stored at the first position in the null_orders array) to // achieve this purpose. if (input.has_nulls()) { null_orders.front() = cudf::null_order::AFTER; } - null_orders_dvec = cudf::detail::make_device_uvector_async(null_orders, stream); + null_orders_dvec = cudf::detail::make_device_uvector_async( + null_orders, stream, rmm::mr::get_current_device_resource()); } // else: Don't need to generate nulls order to copy to device memory if we have all null orders // are BEFORE (that happens when we have is_min_op == false). diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 2b4b6373c35..b208e7cd980 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -467,8 +467,10 @@ get_null_bounds_for_orderby_column(column_view const& orderby_column, cudf::device_span(group_offsets.data(), num_groups); // When there are no nulls, just copy the input group offsets to the output. 
- return std::make_tuple(cudf::detail::make_device_uvector_async(group_offsets_span, stream), - cudf::detail::make_device_uvector_async(group_offsets_span, stream)); + return std::make_tuple(cudf::detail::make_device_uvector_async( + group_offsets_span, stream, rmm::mr::get_current_device_resource()), + cudf::detail::make_device_uvector_async( + group_offsets_span, stream, rmm::mr::get_current_device_resource())); } } diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 177fcab03f9..8d273eff4bb 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -160,7 +160,8 @@ struct format_compiler { } // copy format_items to device memory - d_items = cudf::detail::make_device_uvector_async(items, stream); + d_items = cudf::detail::make_device_uvector_async( + items, stream, rmm::mr::get_current_device_resource()); } device_span format_items() { return device_span(d_items); } diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index e3ee59c631f..92b71d128e1 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -85,7 +85,8 @@ auto create_strings_device_views(host_span views, rmm::cuda_s return static_cast(col.size()); }); thrust::inclusive_scan(thrust::host, offset_it, input_offsets.end(), offset_it); - auto d_input_offsets = cudf::detail::make_device_uvector_async(input_offsets, stream); + auto d_input_offsets = cudf::detail::make_device_uvector_async( + input_offsets, stream, rmm::mr::get_current_device_resource()); auto const output_size = input_offsets.back(); // Compute the partition offsets and size of chars column diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 8a6a4d44b1e..3e38b5fa775 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -132,7 +132,8 @@ std::unique_ptr filter_characters( characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) { return char_range{entry.first, entry.second}; }); - rmm::device_uvector table = cudf::detail::make_device_uvector_async(htable, stream); + rmm::device_uvector table = + cudf::detail::make_device_uvector_async(htable, stream, rmm::mr::get_current_device_resource()); auto d_strings = column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index c6ea47ec0f3..128d450cbe8 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -673,11 +673,10 @@ std::pair>, int> build_comma } while (op.type != path_operator_type::END); auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END; - return is_empty - ? std::pair(thrust::nullopt, 0) - : std::pair( - thrust::make_optional(cudf::detail::make_device_uvector_sync(h_operators, stream)), - max_stack_depth); + return is_empty ? std::pair(thrust::nullopt, 0) + : std::pair(thrust::make_optional(cudf::detail::make_device_uvector_sync( + h_operators, stream, rmm::mr::get_current_device_resource())), + max_stack_depth); } #define PARSE_TRY(_x) \ diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 383337c9088..d25af8c8931 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -118,9 +118,9 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, // parse the repl string for back-ref indicators auto group_count = std::min(99, d_prog->group_counts()); // group count should NOT exceed 99 - auto const parse_result = parse_backrefs(replacement, group_count); - rmm::device_uvector backrefs = - cudf::detail::make_device_uvector_async(parse_result.second, stream); + auto const parse_result = parse_backrefs(replacement, group_count); + rmm::device_uvector backrefs = cudf::detail::make_device_uvector_async( + parse_result.second, stream, rmm::mr::get_current_device_resource()); string_scalar repl_scalar(parse_result.first, true, stream); string_view const d_repl_template = repl_scalar.value(); diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index f3bc7fc82ec..50b2dc27671 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -169,7 +169,8 @@ std::unique_ptr replace_re(strings_column_view const& input, prog->set_working_memory(d_buffer, size); return *prog; }); - auto d_progs = cudf::detail::make_device_uvector_async(progs, stream); + auto d_progs = + cudf::detail::make_device_uvector_async(progs, stream, rmm::mr::get_current_device_resource()); auto const d_strings = column_device_view::create(input.parent(), stream); auto const d_repls = column_device_view::create(replacements.parent(), stream); diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 7f134059ded..e7b637c52f3 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -106,7 +106,7 @@ std::unique_ptr translate(strings_column_view const& strings, }); // copy translate table to device memory rmm::device_uvector table = - cudf::detail::make_device_uvector_async(htable, stream); + cudf::detail::make_device_uvector_async(htable, stream, rmm::mr::get_current_device_resource()); auto d_strings = column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 8a63a6f6411..0c6747f2d12 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -268,7 +268,8 @@ auto list_lex_preprocess(table_view table, rmm::cuda_stream_view stream) dremel_device_views.push_back(dremel_data.back()); } } - auto d_dremel_device_views = detail::make_device_uvector_sync(dremel_device_views, stream); + auto d_dremel_device_views = detail::make_device_uvector_sync( + dremel_device_views, stream, rmm::mr::get_current_device_resource()); return std::make_tuple(std::move(dremel_data), std::move(d_dremel_device_views)); } @@ -333,7 +334,7 @@ void check_shape_compatibility(table_view const& lhs, table_view const& rhs) CUDF_EXPECTS(lhs.num_columns() == rhs.num_columns(), "Cannot compare tables with different number of columns"); for (size_type i = 0; i < lhs.num_columns(); ++i) { - CUDF_EXPECTS(column_types_equal(lhs.column(i), rhs.column(i)), + CUDF_EXPECTS(column_types_equivalent(lhs.column(i), rhs.column(i)), "Cannot compare tables with different column types"); } } @@ -355,10 +356,13 @@ std::shared_ptr preprocessed_table::create( auto [verticalized_lhs, new_column_order, new_null_precedence, verticalized_col_depths] = decompose_structs(t, column_order, null_precedence); - auto d_t = table_device_view::create(verticalized_lhs, stream); - auto d_column_order = detail::make_device_uvector_async(new_column_order, stream); - auto 
d_null_precedence = detail::make_device_uvector_async(new_null_precedence, stream); - auto d_depths = detail::make_device_uvector_async(verticalized_col_depths, stream); + auto d_t = table_device_view::create(verticalized_lhs, stream); + auto d_column_order = detail::make_device_uvector_async( + new_column_order, stream, rmm::mr::get_current_device_resource()); + auto d_null_precedence = detail::make_device_uvector_async( + new_null_precedence, stream, rmm::mr::get_current_device_resource()); + auto d_depths = detail::make_device_uvector_async( + verticalized_col_depths, stream, rmm::mr::get_current_device_resource()); if (detail::has_nested_columns(t)) { auto [dremel_data, d_dremel_device_view] = list_lex_preprocess(verticalized_lhs, stream); diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 634fdd70831..b982a010e6e 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -500,7 +500,8 @@ std::unique_ptr row_bit_count(table_view const& t, auto d_cols = contiguous_copy_column_device_views(cols, stream); // move stack info to the gpu - rmm::device_uvector d_info = cudf::detail::make_device_uvector_async(info, stream); + rmm::device_uvector d_info = + cudf::detail::make_device_uvector_async(info, stream, rmm::mr::get_current_device_resource()); // each thread needs to maintain a stack of row spans of size max_branch_depth. 
we will use // shared memory to do this rather than allocating a potentially gigantic temporary buffer diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp index d297148de45..d6f5c65593a 100644 --- a/cpp/src/utilities/type_checks.cpp +++ b/cpp/src/utilities/type_checks.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,4 +69,10 @@ bool column_types_equal(column_view const& lhs, column_view const& rhs) return type_dispatcher(lhs.type(), columns_equal_fn{}, lhs, rhs); } +bool column_types_equivalent(column_view const& lhs, column_view const& rhs) +{ + if (lhs.type().id() != rhs.type().id()) { return false; } + return type_dispatcher(lhs.type(), columns_equal_fn{}, lhs, rhs); +} + } // namespace cudf diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index 00ec7bd218b..7805828ad55 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -87,7 +87,7 @@ rmm::device_uvector make_mask(cudf::size_type size, bool fil { if (!fill_valid) { return cudf::detail::make_zeroed_device_uvector_sync( - size, cudf::get_default_stream()); + size, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); } else { auto ret = rmm::device_uvector(size, cudf::get_default_stream()); CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), diff --git a/cpp/tests/bitmask/valid_if_tests.cu b/cpp/tests/bitmask/valid_if_tests.cu index cdc453be8e4..cb086cda179 100644 --- a/cpp/tests/bitmask/valid_if_tests.cu +++ b/cpp/tests/bitmask/valid_if_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -43,7 +43,8 @@ TEST_F(ValidIfTest, EmptyRange) auto actual = cudf::detail::valid_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(0), odds_valid{}, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto const& buffer = actual.first; EXPECT_EQ(0u, buffer.size()); EXPECT_EQ(nullptr, buffer.data()); @@ -55,7 +56,8 @@ TEST_F(ValidIfTest, InvalidRange) EXPECT_THROW(cudf::detail::valid_if(thrust::make_counting_iterator(1), thrust::make_counting_iterator(0), odds_valid{}, - cudf::get_default_stream()), + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()), cudf::logic_error); } @@ -66,7 +68,8 @@ TEST_F(ValidIfTest, OddsValid) auto actual = cudf::detail::valid_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(10000), odds_valid{}, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size()); EXPECT_EQ(5000, actual.second); } @@ -78,7 +81,8 @@ TEST_F(ValidIfTest, AllValid) auto actual = cudf::detail::valid_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(10000), all_valid{}, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size()); EXPECT_EQ(0, actual.second); } @@ -90,7 +94,8 @@ TEST_F(ValidIfTest, AllNull) auto actual = cudf::detail::valid_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(10000), all_null{}, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size()); EXPECT_EQ(10000, actual.second); } diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index 1c51eab1f94..a35bbab0176 100644 --- 
a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -812,7 +812,7 @@ TYPED_TEST(StructGetValueTestTyped, mixed_types_valid) // col fields cudf::test::fixed_width_column_wrapper f1{1, 2, 3}; cudf::test::strings_column_wrapper f2{"aa", "bbb", "c"}; - cudf::test::dictionary_column_wrapper f3{42, 42, 24}; + cudf::test::dictionary_column_wrapper f3{42, 42, 24}; LCW f4{LCW{8, 8, 8}, LCW{9, 9}, LCW{10}}; cudf::test::structs_column_wrapper col{f1, f2, f3, f4}; @@ -824,7 +824,7 @@ TYPED_TEST(StructGetValueTestTyped, mixed_types_valid) // expect fields cudf::test::fixed_width_column_wrapper ef1{3}; cudf::test::strings_column_wrapper ef2{"c"}; - cudf::test::dictionary_column_wrapper ef3{24}; + cudf::test::dictionary_column_wrapper ef3{24}; LCW ef4{LCW{10}}; cudf::table_view expect_data{{ef1, ef2, ef3, ef4}}; diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index 43874b84114..5694513647b 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -141,9 +141,10 @@ struct AtomicsTest : public cudf::test::BaseFixture { result_init[4] = result_init[1]; result_init[5] = result_init[2]; - auto dev_data = cudf::detail::make_device_uvector_sync(v, cudf::get_default_stream()); - auto dev_result = - cudf::detail::make_device_uvector_sync(result_init, cudf::get_default_stream()); + auto dev_data = cudf::detail::make_device_uvector_sync( + v, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto dev_result = cudf::detail::make_device_uvector_sync( + result_init, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); if (block_size == 0) { block_size = vec_size; } @@ -294,8 +295,10 @@ struct AtomicsBitwiseOpTest : public cudf::test::BaseFixture { exact[2] = std::accumulate( v.begin(), v.end(), identity[2], [](T acc, uint64_t i) { return acc ^ T(i); }); - auto dev_result = cudf::detail::make_device_uvector_sync(identity, cudf::get_default_stream()); - auto dev_data = cudf::detail::make_device_uvector_sync(v, cudf::get_default_stream()); + auto dev_result = cudf::detail::make_device_uvector_sync( + identity, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto dev_data = cudf::detail::make_device_uvector_sync( + v, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); if (block_size == 0) { block_size = vec_size; } diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu index ab9970dc370..9631e433a5e 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cu +++ b/cpp/tests/fixed_point/fixed_point_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -83,7 +83,8 @@ TEST_F(FixedPointTest, DecimalXXThrustOnDevice) using decimal32 = fixed_point; std::vector vec1(1000, decimal32{1, scale_type{-2}}); - auto d_vec1 = cudf::detail::make_device_uvector_sync(vec1, cudf::get_default_stream()); + auto d_vec1 = cudf::detail::make_device_uvector_sync( + vec1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const sum = thrust::reduce(rmm::exec_policy(cudf::get_default_stream()), std::cbegin(d_vec1), @@ -96,7 +97,8 @@ TEST_F(FixedPointTest, DecimalXXThrustOnDevice) // change inclusive scan to run on device (avoid copying to host) thrust::inclusive_scan(std::cbegin(vec1), std::cend(vec1), std::begin(vec1)); - d_vec1 = cudf::detail::make_device_uvector_sync(vec1, cudf::get_default_stream()); + d_vec1 = cudf::detail::make_device_uvector_sync( + vec1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); std::vector vec2(1000); std::iota(std::begin(vec2), std::end(vec2), 1); diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index d2b159fc208..3f4d5bcf20f 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -264,7 +264,7 @@ TEST_F(FromArrowTest, DictionaryIndicesType) auto arrow_table = arrow::Table::Make(schema, {array1, array2, array3}); std::vector> columns; - auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1}); + auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1}); columns.emplace_back(std::move(cudf::dictionary::encode(col))); columns.emplace_back(std::move(cudf::dictionary::encode(col))); columns.emplace_back(std::move(cudf::dictionary::encode(col))); diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index 94a7c8edcf9..0ae0360c4d9 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -586,8 +586,8 @@ TEST_F(JsonTest, TreeRepresentation) cudf::io::json_reader_options const options{}; // Parse the JSON and get the token stream - const auto [tokens_gpu, token_indices_gpu] = - cudf::io::json::detail::get_token_stream(d_input, options, stream); + const auto [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream( + d_input, options, stream, rmm::mr::get_current_device_resource()); // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( @@ -673,8 +673,8 @@ TEST_F(JsonTest, TreeRepresentation2) cudf::io::json_reader_options const options{}; // Parse the JSON and get the token stream - const auto [tokens_gpu, token_indices_gpu] = - cudf::io::json::detail::get_token_stream(d_input, options, stream); + const auto [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream( + d_input, options, stream, rmm::mr::get_current_device_resource()); // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( @@ -747,8 +747,8 @@ TEST_F(JsonTest, TreeRepresentation3) options.enable_lines(true); // Parse the JSON and get the token stream - const auto [tokens_gpu, token_indices_gpu] = - cudf::io::json::detail::get_token_stream(d_input, options, stream); + const auto 
[tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream( + d_input, options, stream, rmm::mr::get_current_device_resource()); // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( @@ -772,8 +772,8 @@ TEST_F(JsonTest, TreeRepresentationError) cudf::io::json_reader_options const options{}; // Parse the JSON and get the token stream - const auto [tokens_gpu, token_indices_gpu] = - cudf::io::json::detail::get_token_stream(d_input, options, stream); + const auto [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream( + d_input, options, stream, rmm::mr::get_current_device_resource()); // Get the JSON's tree representation // This JSON is invalid and will raise an exception. @@ -855,8 +855,8 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal) static_cast(d_scalar.size())}; // Parse the JSON and get the token stream - const auto [tokens_gpu, token_indices_gpu] = - cudf::io::json::detail::get_token_stream(d_input, options, stream); + const auto [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream( + d_input, options, stream, rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); bool const is_array_of_arrays = diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 3c01bd4de25..5b797a00ca1 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -262,8 +262,8 @@ TEST_F(JsonTest, TokenStream) cudf::device_span{d_scalar.data(), static_cast(d_scalar.size())}; // Parse the JSON and get the token stream - auto [d_tokens_gpu, d_token_indices_gpu] = - cuio_json::detail::get_token_stream(d_input, default_options, stream); + auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( + d_input, default_options, stream, rmm::mr::get_current_device_resource()); // Copy back the number of tokens 
that were written thrust::host_vector const tokens_gpu = cudf::detail::make_host_vector_async(d_tokens_gpu, stream); @@ -398,8 +398,8 @@ TEST_F(JsonTest, TokenStream2) cudf::device_span{d_scalar.data(), static_cast(d_scalar.size())}; // Parse the JSON and get the token stream - auto [d_tokens_gpu, d_token_indices_gpu] = - cuio_json::detail::get_token_stream(d_input, default_options, stream); + auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( + d_input, default_options, stream, rmm::mr::get_current_device_resource()); // Copy back the number of tokens that were written thrust::host_vector const tokens_gpu = cudf::detail::make_host_vector_async(d_tokens_gpu, stream); @@ -470,7 +470,9 @@ TEST_P(JsonParserTest, ExtractColumn) std::string const input = R"( [{"a":0.0, "b":1.0}, {"a":0.1, "b":1.1}, {"a":0.2, "b":1.2}] )"; auto const d_input = cudf::detail::make_device_uvector_async( - cudf::host_span{input.c_str(), input.size()}, stream); + cudf::host_span{input.c_str(), input.size()}, + stream, + rmm::mr::get_current_device_resource()); // Get the JSON's tree representation auto const cudf_table = json_parser(d_input, default_options, stream, mr); @@ -508,7 +510,9 @@ TEST_P(JsonParserTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}])"; auto const d_ascii_pass = cudf::detail::make_device_uvector_sync( - cudf::host_span{ascii_pass.c_str(), ascii_pass.size()}, stream); + cudf::host_span{ascii_pass.c_str(), ascii_pass.size()}, + stream, + rmm::mr::get_current_device_resource()); CUDF_EXPECT_NO_THROW(json_parser(d_ascii_pass, default_options, stream, mr)); @@ -521,7 +525,9 @@ TEST_P(JsonParserTest, UTF_JSON) {"a":1,"b":null,"c":null}, {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "filip Ê’akotÉ›"}}])"; auto const d_utf_failed = cudf::detail::make_device_uvector_sync( - cudf::host_span{utf_failed.c_str(), utf_failed.size()}, stream); + 
cudf::host_span{utf_failed.c_str(), utf_failed.size()}, + stream, + rmm::mr::get_current_device_resource()); CUDF_EXPECT_NO_THROW(json_parser(d_utf_failed, default_options, stream, mr)); // utf-8 string that passes parsing. @@ -534,7 +540,9 @@ TEST_P(JsonParserTest, UTF_JSON) {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}, {"a":1,"b":NaN,"c":[null, null], "d": {"year": 2, "author": "filip Ê’akotÉ›"}}])"; auto const d_utf_pass = cudf::detail::make_device_uvector_sync( - cudf::host_span{utf_pass.c_str(), utf_pass.size()}, stream); + cudf::host_span{utf_pass.c_str(), utf_pass.size()}, + stream, + rmm::mr::get_current_device_resource()); CUDF_EXPECT_NO_THROW(json_parser(d_utf_pass, default_options, stream, mr)); } @@ -555,7 +563,9 @@ TEST_P(JsonParserTest, ExtractColumnWithQuotes) std::string const input = R"( [{"a":"0.0", "b":1.0}, {"b":1.1}, {"b":2.1, "a":"2.0"}] )"; auto const d_input = cudf::detail::make_device_uvector_async( - cudf::host_span{input.c_str(), input.size()}, stream); + cudf::host_span{input.c_str(), input.size()}, + stream, + rmm::mr::get_current_device_resource()); // Get the JSON's tree representation auto const cudf_table = json_parser(d_input, options, stream, mr); @@ -599,14 +609,18 @@ TEST_P(JsonParserTest, ExpectFailMixStructAndList) // libcudf does not currently support a mix of lists and structs. 
for (auto const& input : inputs_fail) { auto const d_input = cudf::detail::make_device_uvector_async( - cudf::host_span{input.c_str(), input.size()}, stream); + cudf::host_span{input.c_str(), input.size()}, + stream, + rmm::mr::get_current_device_resource()); EXPECT_THROW(auto const cudf_table = json_parser(d_input, options, stream, mr), cudf::logic_error); } for (auto const& input : inputs_succeed) { auto const d_input = cudf::detail::make_device_uvector_async( - cudf::host_span{input.c_str(), input.size()}, stream); + cudf::host_span{input.c_str(), input.size()}, + stream, + rmm::mr::get_current_device_resource()); CUDF_EXPECT_NO_THROW(auto const cudf_table = json_parser(d_input, options, stream, mr)); } } @@ -626,8 +640,10 @@ TEST_P(JsonParserTest, EmptyString) cudf::io::json_reader_options default_options{}; std::string const input = R"([])"; - auto const d_input = cudf::detail::make_device_uvector_sync( - cudf::host_span{input.c_str(), input.size()}, stream); + auto const d_input = + cudf::detail::make_device_uvector_sync(cudf::host_span{input.c_str(), input.size()}, + stream, + rmm::mr::get_current_device_resource()); // Get the JSON's tree representation auto const cudf_table = json_parser(d_input, default_options, stream, mr); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index e82b0c670b8..8a16fd9a05a 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -1128,7 +1128,8 @@ TEST_F(ParquetWriterTest, BufferSource) auto const d_input = cudf::detail::make_device_uvector_sync( cudf::host_span{reinterpret_cast(out_buffer.data()), out_buffer.size()}, - cudf::get_default_stream()); + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto const d_buffer = cudf::device_span( reinterpret_cast(d_input.data()), d_input.size()); cudf::io::parquet_reader_options in_opts = diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu index ea6eb9b93ef..81c6563cd2d 100644 
--- a/cpp/tests/io/type_inference_test.cu +++ b/cpp/tests/io/type_inference_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,12 +51,12 @@ TEST_F(TypeInference, Basic) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 4, 7}; - auto const string_length = std::vector{2, 2, 1}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 4, 7}; + auto const string_length = std::vector{2, 2, 1}; + auto const d_string_offset = cudf::detail::make_device_uvector_async( + string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); @@ -84,12 +84,12 @@ TEST_F(TypeInference, Null) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 4}; - auto const string_length = std::vector{0, 2, 1}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 1, 4}; + auto const string_length = std::vector{0, 2, 1}; + auto const d_string_offset = cudf::detail::make_device_uvector_async( + 
string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); @@ -117,12 +117,12 @@ TEST_F(TypeInference, AllNull) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 1, 1}; - auto const string_length = std::vector{0, 0, 4}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 1, 1}; + auto const string_length = std::vector{0, 0, 4}; + auto const d_string_offset = cudf::detail::make_device_uvector_async( + string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); @@ -150,12 +150,12 @@ TEST_F(TypeInference, String) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 8, 12}; - auto const string_length = std::vector{6, 3, 4}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 8, 12}; + auto const string_length = std::vector{6, 3, 4}; + auto const d_string_offset = 
cudf::detail::make_device_uvector_async( + string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); @@ -183,12 +183,12 @@ TEST_F(TypeInference, Bool) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 6, 12}; - auto const string_length = std::vector{4, 5, 5}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 6, 12}; + auto const string_length = std::vector{4, 5, 5}; + auto const d_string_offset = cudf::detail::make_device_uvector_async( + string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); @@ -216,12 +216,12 @@ TEST_F(TypeInference, Timestamp) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 10}; - auto const string_length = std::vector{8, 9}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 10}; + auto const string_length = std::vector{8, 9}; + 
auto const d_string_offset = cudf::detail::make_device_uvector_async( + string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); @@ -250,12 +250,12 @@ TEST_F(TypeInference, InvalidInput) auto d_data = cudf::make_string_scalar(data); auto& d_string_scalar = static_cast(*d_data); - auto const string_offset = std::vector{1, 3, 5, 7, 9}; - auto const string_length = std::vector{1, 1, 1, 1, 1}; - auto const d_string_offset = - cudf::detail::make_device_uvector_async(string_offset, cudf::get_default_stream()); - auto const d_string_length = - cudf::detail::make_device_uvector_async(string_length, cudf::get_default_stream()); + auto const string_offset = std::vector{1, 3, 5, 7, 9}; + auto const string_length = std::vector{1, 1, 1, 1, 1}; + auto const d_string_offset = cudf::detail::make_device_uvector_async( + string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto const d_string_length = cudf::detail::make_device_uvector_async( + string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_col_strings = thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin())); diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh index 894e117ba40..882de994e67 100644 --- a/cpp/tests/iterator/iterator_tests.cuh +++ b/cpp/tests/iterator/iterator_tests.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -87,8 +87,8 @@ struct IteratorTest : public cudf::test::BaseFixture { { InputIterator d_in_last = d_in + num_items; EXPECT_EQ(thrust::distance(d_in, d_in_last), num_items); - auto dev_expected = - cudf::detail::make_device_uvector_sync(expected, cudf::get_default_stream()); + auto dev_expected = cudf::detail::make_device_uvector_sync( + expected, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); // using a temporary vector and calling transform and all_of separately is // equivalent to thrust::equal but compiles ~3x faster diff --git a/cpp/tests/iterator/value_iterator_test.cuh b/cpp/tests/iterator/value_iterator_test.cuh index fa931d34a0e..8252ce88f39 100644 --- a/cpp/tests/iterator/value_iterator_test.cuh +++ b/cpp/tests/iterator/value_iterator_test.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,8 @@ template void non_null_iterator(IteratorTest& testFixture) { auto host_array = cudf::test::make_type_param_vector({0, 6, 0, -14, 13, 64, -13, -20, 45}); - auto dev_array = cudf::detail::make_device_uvector_sync(host_array, cudf::get_default_stream()); + auto dev_array = cudf::detail::make_device_uvector_sync( + host_array, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); // calculate the expected value by CPU. thrust::host_vector replaced_array(host_array); diff --git a/cpp/tests/iterator/value_iterator_test_strings.cu b/cpp/tests/iterator/value_iterator_test_strings.cu index 8b4080fa493..d0e62c09a03 100644 --- a/cpp/tests/iterator/value_iterator_test_strings.cu +++ b/cpp/tests/iterator/value_iterator_test_strings.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,7 +30,8 @@ auto strings_to_string_views(std::vector& input_strings) std::vector offsets; std::tie(chars, offsets) = cudf::test::detail::make_chars_and_offsets( input_strings.begin(), input_strings.end(), all_valid); - auto dev_chars = cudf::detail::make_device_uvector_sync(chars, cudf::get_default_stream()); + auto dev_chars = cudf::detail::make_device_uvector_sync( + chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); // calculate the expected value by CPU. (but contains device pointers) thrust::host_vector replaced_array(input_strings.size()); @@ -51,8 +52,9 @@ TEST_F(StringIteratorTest, string_view_null_iterator) using T = cudf::string_view; std::string zero("zero"); // the char data has to be in GPU - auto initmsg = cudf::detail::make_device_uvector_sync(zero, cudf::get_default_stream()); - T init = T{initmsg.data(), int(initmsg.size())}; + auto initmsg = cudf::detail::make_device_uvector_sync( + zero, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + T init = T{initmsg.data(), int(initmsg.size())}; // data and valid arrays std::vector host_values( @@ -86,8 +88,9 @@ TEST_F(StringIteratorTest, string_view_no_null_iterator) // T init = T{"", 0}; std::string zero("zero"); // the char data has to be in GPU - auto initmsg = cudf::detail::make_device_uvector_sync(zero, cudf::get_default_stream()); - T init = T{initmsg.data(), int(initmsg.size())}; + auto initmsg = cudf::detail::make_device_uvector_sync( + zero, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + T init = T{initmsg.data(), int(initmsg.size())}; // data array std::vector host_values( @@ -110,8 +113,9 @@ TEST_F(StringIteratorTest, string_scalar_iterator) // T init = T{"", 0}; std::string zero("zero"); // the char data has to be in GPU - auto initmsg = cudf::detail::make_device_uvector_sync(zero, 
cudf::get_default_stream()); - T init = T{initmsg.data(), int(initmsg.size())}; + auto initmsg = cudf::detail::make_device_uvector_sync( + zero, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + T init = T{initmsg.data(), int(initmsg.size())}; // data array std::vector host_values(100, zero); diff --git a/cpp/tests/partitioning/hash_partition_test.cpp b/cpp/tests/partitioning/hash_partition_test.cpp index 9d206c5397d..a1508b5b973 100644 --- a/cpp/tests/partitioning/hash_partition_test.cpp +++ b/cpp/tests/partitioning/hash_partition_test.cpp @@ -308,8 +308,8 @@ void run_fixed_width_test(size_t cols, // Make a table view of the partition numbers constexpr cudf::data_type dtype{cudf::type_id::INT32}; - auto d_partitions = - cudf::detail::make_device_uvector_sync(partitions, cudf::get_default_stream()); + auto d_partitions = cudf::detail::make_device_uvector_sync( + partitions, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); cudf::column_view partitions_col(dtype, rows, d_partitions.data()); cudf::table_view partitions_table({partitions_col}); diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index 47bcbb874cf..40b0d268580 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -49,9 +49,9 @@ TYPED_TEST(SegmentedReductionTest, SumExcludeNulls) // output nullmask: {1, 1, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = 
cudf::test::fixed_width_column_wrapper{{6, 4, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}}; @@ -97,9 +97,9 @@ TYPED_TEST(SegmentedReductionTest, ProductExcludeNulls) // output nullmask: {1, 1, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 1, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{15, 15, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}}; @@ -147,9 +147,9 @@ TYPED_TEST(SegmentedReductionTest, MaxExcludeNulls) // output nullmask: {1, 1, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{3, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}}; @@ -195,9 +195,9 @@ TYPED_TEST(SegmentedReductionTest, MinExcludeNulls) // output nullmask: {1, 1, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto 
const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{1, 1, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}}; @@ -244,9 +244,9 @@ TYPED_TEST(SegmentedReductionTest, AnyExcludeNulls) auto const input = cudf::test::fixed_width_column_wrapper{ {0, 0, 0, 0, XXX, 0, 0, 1, 0, 1, XXX, 0, 0, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 9, 12, 12, 13, 14, 15, 17}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 9, 12, 12, 13, 14, 15, 17}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{ {false, false, true, true, bool{XXX}, false, true, bool{XXX}, bool{XXX}}, {true, true, true, true, false, true, true, false, false}}; @@ -284,9 +284,9 @@ TYPED_TEST(SegmentedReductionTest, AllExcludeNulls) auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX, 1, 0, 3, 1, XXX, 0, 0}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}}; - auto const offsets = std::vector{0, 3, 6, 6, 7, 8, 10, 13, 16, 17}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 6, 7, 8, 10, 13, 16, 17}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{ {true, true, bool{XXX}, true, bool{XXX}, bool{XXX}, false, false, false}, {true, true, false, true, false, false, true, true, true}}; @@ -335,9 +335,9 @@ TYPED_TEST(SegmentedReductionTest, SumIncludeNulls) // output 
nullmask: {1, 0, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{6, XXX, 1, XXX, XXX, XXX}, {1, 0, 1, 0, 0, 0}}; @@ -386,9 +386,9 @@ TYPED_TEST(SegmentedReductionTest, ProductIncludeNulls) // output nullmask: {1, 0, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 1, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{15, XXX, 1, XXX, XXX, XXX}, {1, 0, 1, 0, 0, 0}}; @@ -439,9 +439,9 @@ TYPED_TEST(SegmentedReductionTest, MaxIncludeNulls) // output nullmask: {1, 0, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = 
cudf::test::fixed_width_column_wrapper{{3, XXX, 1, XXX, XXX, XXX}, {1, 0, 1, 0, 0, 0}}; @@ -490,9 +490,9 @@ TYPED_TEST(SegmentedReductionTest, MinIncludeNulls) // output nullmask: {1, 0, 1, 0, 0, 0} auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{1, XXX, 1, XXX, XXX, XXX}, {1, 0, 1, 0, 0, 0}}; @@ -542,9 +542,9 @@ TYPED_TEST(SegmentedReductionTest, AnyIncludeNulls) auto const input = cudf::test::fixed_width_column_wrapper{ {0, 0, 0, 0, XXX, 0, 0, 1, 0, 1, XXX, 0, 0, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0}}; - auto const offsets = std::vector{0, 3, 6, 9, 12, 12, 13, 14, 15, 17}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 9, 12, 12, 13, 14, 15, 17}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{ {false, bool{XXX}, true, bool{XXX}, bool{XXX}, false, true, bool{XXX}, bool{XXX}}, {true, false, true, false, false, true, true, false, false}}; @@ -605,9 +605,9 @@ TYPED_TEST(SegmentedReductionTest, AllIncludeNulls) auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX, 1, 0, 3, 1, XXX, 0, 0}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}}; - auto const offsets = std::vector{0, 3, 6, 6, 7, 8, 10, 13, 16, 17}; - auto const d_offsets = - 
cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 6, 7, 8, 10, 13, 16, 17}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{ {true, bool{XXX}, bool{XXX}, true, bool{XXX}, bool{XXX}, false, bool{XXX}, false}, {true, false, false, true, false, false, true, false, true}}; @@ -670,9 +670,9 @@ TEST_F(SegmentedReductionTestUntyped, PartialSegmentReduction) auto const input = cudf::test::fixed_width_column_wrapper{ {1, 2, 3, 4, 5, 6, 7}, {true, true, true, true, true, true, true}}; - auto const offsets = std::vector{1, 3, 4}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{1, 3, 4}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{5, 4}, {true, true}}; auto res = @@ -720,10 +720,10 @@ TEST_F(SegmentedReductionTestUntyped, NonNullableInput) // outputs: {1, 5, 4} // output nullmask: {1, 1, 1} - auto const input = cudf::test::fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7}; - auto const offsets = std::vector{0, 1, 1, 3, 7}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const input = cudf::test::fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7}; + auto const offsets = std::vector{0, 1, 1, 3, 7}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{1, XXX, 5, 22}, {true, false, true, true}}; @@ -767,9 +767,9 @@ TEST_F(SegmentedReductionTestUntyped, Mean) { auto const input = 
cudf::test::fixed_width_column_wrapper{10, 20, 30, 40, 50, 60, 70, 80, 90}; - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_mean_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::FLOAT32}; @@ -786,9 +786,9 @@ TEST_F(SegmentedReductionTestUntyped, MeanNulls) { auto const input = cudf::test::fixed_width_column_wrapper( {10, 20, 30, 40, 50, 60, 0, 80, 90}, {1, 1, 1, 1, 1, 1, 0, 1, 1}); - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_mean_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::FLOAT64}; @@ -806,9 +806,9 @@ TEST_F(SegmentedReductionTestUntyped, SumOfSquares) { auto const input = cudf::test::fixed_width_column_wrapper{10, 20, 30, 40, 50, 60, 70, 80, 90}; - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_sum_of_squares_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::INT32}; @@ -826,9 +826,9 @@ TEST_F(SegmentedReductionTestUntyped, SumOfSquaresNulls) { auto const input = cudf::test::fixed_width_column_wrapper( {10, 20, 
30, 40, 50, 60, 0, 80, 90}, {1, 1, 1, 1, 1, 1, 0, 1, 1}); - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_sum_of_squares_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::INT64}; @@ -848,9 +848,9 @@ TEST_F(SegmentedReductionTestUntyped, StandardDeviation) constexpr float NaN{std::numeric_limits::quiet_NaN()}; auto const input = cudf::test::fixed_width_column_wrapper{10, 20, 30, 40, 50, 60, 70, 80, 90}; - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_std_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::FLOAT32}; @@ -868,9 +868,9 @@ TEST_F(SegmentedReductionTestUntyped, StandardDeviationNulls) constexpr double NaN{std::numeric_limits::quiet_NaN()}; auto const input = cudf::test::fixed_width_column_wrapper( {10, 0, 20, 30, 54, 63, 0, 72, 81}, {1, 0, 1, 1, 1, 1, 0, 1, 1}); - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_std_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::FLOAT64}; @@ -890,9 +890,9 @@ 
TEST_F(SegmentedReductionTestUntyped, Variance) constexpr float NaN{std::numeric_limits::quiet_NaN()}; auto const input = cudf::test::fixed_width_column_wrapper{10, 20, 30, 40, 50, 60, 70, 80, 90}; - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_variance_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::FLOAT32}; @@ -910,9 +910,9 @@ TEST_F(SegmentedReductionTestUntyped, VarianceNulls) constexpr double NaN{std::numeric_limits::quiet_NaN()}; auto const input = cudf::test::fixed_width_column_wrapper( {10, 0, 20, 30, 54, 63, 0, 72, 81}, {1, 0, 1, 1, 1, 1, 0, 1, 1}); - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_variance_aggregation(); auto const output_type = cudf::data_type{cudf::type_id::FLOAT64}; @@ -931,9 +931,9 @@ TEST_F(SegmentedReductionTestUntyped, Errors) { auto const input = cudf::test::fixed_width_column_wrapper( {10, 0, 20, 30, 54, 63, 0, 72, 81}, {1, 0, 1, 1, 1, 1, 0, 1, 1}); - auto const offsets = std::vector{0, 1, 1, 4, 9}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 1, 1, 4, 9}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const null_policy = cudf::null_policy::EXCLUDE; auto 
const output_type = cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}; auto const str_input = @@ -999,10 +999,10 @@ TEST_F(SegmentedReductionTestUntyped, Errors) TEST_F(SegmentedReductionTestUntyped, ReduceEmptyColumn) { - auto const input = cudf::test::fixed_width_column_wrapper{}; - auto const offsets = std::vector{0}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const input = cudf::test::fixed_width_column_wrapper{}; + auto const offsets = std::vector{0}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{}; auto res = @@ -1036,10 +1036,10 @@ TEST_F(SegmentedReductionTestUntyped, ReduceEmptyColumn) TEST_F(SegmentedReductionTestUntyped, EmptyInputWithOffsets) { - auto const input = cudf::test::fixed_width_column_wrapper{}; - auto const offsets = std::vector{0, 0, 0, 0, 0, 0}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const input = cudf::test::fixed_width_column_wrapper{}; + auto const offsets = std::vector{0, 0, 0, 0, 0, 0}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::fixed_width_column_wrapper{{XXX, XXX, XXX, XXX, XXX}, {0, 0, 0, 0, 0}}; @@ -1087,9 +1087,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, MaxWithNulls) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = 
cudf::make_max_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1115,9 +1115,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, MinWithNulls) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_min_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1143,9 +1143,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, MaxNonNullableInput) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 4, 4}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 4, 4}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_max_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1168,9 +1168,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, MinNonNullableInput) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 4, 4}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 4, 4}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_min_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1193,9 +1193,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, Sum) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - 
cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_sum_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1231,9 +1231,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, Product) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 6, 7, 8, 12, 12}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 12, 12}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_product_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1268,9 +1268,9 @@ TYPED_TEST(SegmentedReductionFixedPointTest, SumOfSquares) { using RepType = cudf::device_storage_type_t; - auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const offsets = std::vector{0, 3, 6, 7, 8, 10, 10}; + auto const d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const agg = cudf::make_sum_of_squares_aggregation(); for (auto scale : {-2, 0, 5}) { @@ -1431,10 +1431,10 @@ TEST_F(SegmentedReductionStringTest, MinExcludeNulls) TEST_F(SegmentedReductionStringTest, EmptyInputWithOffsets) { - auto const input = cudf::test::strings_column_wrapper{}; - auto const offsets = std::vector{0, 0, 0, 0}; - auto const d_offsets = - cudf::detail::make_device_uvector_async(offsets, cudf::get_default_stream()); + auto const input = cudf::test::strings_column_wrapper{}; + auto const offsets = std::vector{0, 0, 0, 0}; + auto const 
d_offsets = cudf::detail::make_device_uvector_async( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const expect = cudf::test::strings_column_wrapper({XXX, XXX, XXX}, {0, 0, 0}); auto result = diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu index c7365d63e1c..9e0f68573a5 100644 --- a/cpp/tests/scalar/scalar_device_view_test.cu +++ b/cpp/tests/scalar/scalar_device_view_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -130,7 +130,8 @@ TEST_F(StringScalarDeviceViewTest, Value) auto scalar_device_view = cudf::get_scalar_device_view(s); rmm::device_scalar result{cudf::get_default_stream()}; - auto value_v = cudf::detail::make_device_uvector_sync(value, cudf::get_default_stream()); + auto value_v = cudf::detail::make_device_uvector_sync( + value, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); test_string_value<<<1, 1, 0, cudf::get_default_stream().value()>>>( scalar_device_view, value_v.data(), value.size(), result.data()); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 5331c4c34d8..316f24e4167 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -298,9 +298,11 @@ TEST_F(StringsContainsTests, HexTest) std::vector offsets( {thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + count + 1}); - auto d_chars = cudf::detail::make_device_uvector_sync(ascii_chars, cudf::get_default_stream()); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, cudf::get_default_stream()); - auto input = cudf::make_strings_column(d_chars, d_offsets); + auto d_chars = cudf::detail::make_device_uvector_sync( + ascii_chars, cudf::get_default_stream(), 
rmm::mr::get_current_device_resource()); + auto d_offsets = cudf::detail::make_device_uvector_sync( + offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto input = cudf::make_strings_column(d_chars, d_offsets); auto strings_view = cudf::strings_column_view(input->view()); for (auto ch : ascii_chars) { diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index e3df8db721d..77857049e7a 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -78,7 +78,8 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) } h_offsets[idx + 1] = offset; } - auto d_strings = cudf::detail::make_device_uvector_sync(strings, cudf::get_default_stream()); + auto d_strings = cudf::detail::make_device_uvector_sync( + strings, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); CUDF_CUDA_TRY(cudaMemcpy(d_buffer.data(), h_buffer.data(), memsize, cudaMemcpyDefault)); auto column = cudf::make_strings_column(d_strings); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING}); @@ -143,10 +144,13 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) } std::vector h_nulls{h_null_mask}; - auto d_buffer = cudf::detail::make_device_uvector_sync(h_buffer, cudf::get_default_stream()); - auto d_offsets = cudf::detail::make_device_uvector_sync(h_offsets, cudf::get_default_stream()); - auto d_nulls = cudf::detail::make_device_uvector_sync(h_nulls, cudf::get_default_stream()); - auto column = cudf::make_strings_column(d_buffer, d_offsets, d_nulls, null_count); + auto d_buffer = cudf::detail::make_device_uvector_sync( + h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_offsets = cudf::detail::make_device_uvector_sync( + h_offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_nulls = cudf::detail::make_device_uvector_sync( + h_nulls, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto 
column = cudf::make_strings_column(d_buffer, d_offsets, d_nulls, null_count); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING}); EXPECT_EQ(column->null_count(), null_count); EXPECT_EQ(2, column->num_children()); @@ -184,8 +188,8 @@ TEST_F(StringsFactoriesTest, CreateScalar) TEST_F(StringsFactoriesTest, EmptyStringsColumn) { rmm::device_uvector d_chars{0, cudf::get_default_stream()}; - auto d_offsets = - cudf::detail::make_zeroed_device_uvector_sync(1, cudf::get_default_stream()); + auto d_offsets = cudf::detail::make_zeroed_device_uvector_sync( + 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); rmm::device_uvector d_nulls{0, cudf::get_default_stream()}; auto results = cudf::make_strings_column(d_chars, d_offsets, d_nulls, 0); diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 04e6886a08a..79e96ff5121 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -297,8 +297,9 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) std::iota(h_integers.begin(), h_integers.end(), -(TypeParam)(h_integers.size() / 2)); h_integers.push_back(std::numeric_limits::min()); h_integers.push_back(std::numeric_limits::max()); - auto d_integers = cudf::detail::make_device_uvector_sync(h_integers, cudf::get_default_stream()); - auto integers = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, + auto d_integers = cudf::detail::make_device_uvector_sync( + h_integers, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto integers = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, (cudf::size_type)d_integers.size()); auto integers_view = integers->mutable_view(); CUDF_CUDA_TRY(cudaMemcpy(integers_view.data(), diff --git a/cpp/tests/table/table_view_tests.cu b/cpp/tests/table/table_view_tests.cu index 0542d007ca0..5127f69162f 100644 --- a/cpp/tests/table/table_view_tests.cu +++ b/cpp/tests/table/table_view_tests.cu @@ -1,5 
+1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,8 +47,8 @@ void row_comparison(cudf::table_view input1, auto device_table_1 = cudf::table_device_view::create(input1, stream); auto device_table_2 = cudf::table_device_view::create(input2, stream); - auto d_column_order = - cudf::detail::make_device_uvector_sync(column_order, cudf::get_default_stream()); + auto d_column_order = cudf::detail::make_device_uvector_sync( + column_order, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto comparator = cudf::row_lexicographic_comparator( cudf::nullate::NO{}, *device_table_1, *device_table_2, d_column_order.data()); diff --git a/cpp/tests/types/type_dispatcher_test.cu b/cpp/tests/types/type_dispatcher_test.cu index 911911851f2..a27d8931ee6 100644 --- a/cpp/tests/types/type_dispatcher_test.cu +++ b/cpp/tests/types/type_dispatcher_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -69,7 +69,8 @@ __global__ void dispatch_test_kernel(cudf::type_id id, bool* d_result) TYPED_TEST(TypedDispatcherTest, DeviceDispatch) { - auto result = cudf::detail::make_zeroed_device_uvector_sync(1, cudf::get_default_stream()); + auto result = cudf::detail::make_zeroed_device_uvector_sync( + 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); dispatch_test_kernel<<<1, 1, 0, cudf::get_default_stream().value()>>>( cudf::type_to_id(), result.data()); CUDF_CUDA_TRY(cudaDeviceSynchronize()); @@ -130,7 +131,8 @@ __global__ void double_dispatch_test_kernel(cudf::type_id id1, cudf::type_id id2 TYPED_TEST(TypedDoubleDispatcherTest, DeviceDoubleDispatch) { - auto result = cudf::detail::make_zeroed_device_uvector_sync(1, cudf::get_default_stream()); + auto result = cudf::detail::make_zeroed_device_uvector_sync( + 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); double_dispatch_test_kernel<<<1, 1, 0, cudf::get_default_stream().value()>>>( cudf::type_to_id(), cudf::type_to_id(), result.data()); CUDF_CUDA_TRY(cudaDeviceSynchronize()); diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 6c441539621..3a94aac1cc9 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -371,55 +372,56 @@ struct column_property_comparator { } }; +template class corresponding_rows_unequal { public: - corresponding_rows_unequal(table_device_view d_lhs, - table_device_view d_rhs, - column_device_view lhs_row_indices_, + corresponding_rows_unequal(column_device_view lhs_row_indices_, column_device_view rhs_row_indices_, - size_type /*fp_ulps*/) - : comp(cudf::nullate::YES{}, d_lhs, d_rhs, cudf::null_equality::EQUAL), - lhs_row_indices(lhs_row_indices_), - rhs_row_indices(rhs_row_indices_) + 
size_type /*fp_ulps*/, + DeviceComparator comp_, + column_device_view /*lhs*/, + column_device_view /*rhs*/) + : lhs_row_indices(lhs_row_indices_), rhs_row_indices(rhs_row_indices_), comp(comp_) { } - cudf::row_equality_comparator comp; - __device__ bool operator()(size_type index) { - return !comp(lhs_row_indices.element(index), - rhs_row_indices.element(index)); + using cudf::experimental::row::lhs_index_type; + using cudf::experimental::row::rhs_index_type; + + return !comp(lhs_index_type{lhs_row_indices.element(index)}, + rhs_index_type{rhs_row_indices.element(index)}); } column_device_view lhs_row_indices; column_device_view rhs_row_indices; + DeviceComparator comp; }; +template class corresponding_rows_not_equivalent { - table_device_view d_lhs; - table_device_view d_rhs; - column_device_view lhs_row_indices; column_device_view rhs_row_indices; - size_type const fp_ulps; + DeviceComparator comp; + column_device_view lhs; + column_device_view rhs; public: - corresponding_rows_not_equivalent(table_device_view d_lhs, - table_device_view d_rhs, - column_device_view lhs_row_indices_, + corresponding_rows_not_equivalent(column_device_view lhs_row_indices_, column_device_view rhs_row_indices_, - size_type fp_ulps_) - : d_lhs(d_lhs), - d_rhs(d_rhs), - comp(cudf::nullate::YES{}, d_lhs, d_rhs, null_equality::EQUAL), - lhs_row_indices(lhs_row_indices_), + size_type fp_ulps_, + DeviceComparator comp_, + column_device_view lhs_, + column_device_view rhs_) + : lhs_row_indices(lhs_row_indices_), rhs_row_indices(rhs_row_indices_), - fp_ulps(fp_ulps_) + fp_ulps(fp_ulps_), + comp(comp_), + lhs(lhs_), + rhs(rhs_) { - CUDF_EXPECTS(d_lhs.num_columns() == 1 and d_rhs.num_columns() == 1, - "Unsupported number of columns"); } struct typed_element_not_equivalent { @@ -459,23 +461,17 @@ class corresponding_rows_not_equivalent { } }; - cudf::row_equality_comparator comp; - __device__ bool operator()(size_type index) { + using cudf::experimental::row::lhs_index_type; + using 
cudf::experimental::row::rhs_index_type; + auto const lhs_index = lhs_row_indices.element(index); auto const rhs_index = rhs_row_indices.element(index); - if (not comp(lhs_index, rhs_index)) { - auto lhs_col = this->d_lhs.column(0); - auto rhs_col = this->d_rhs.column(0); - return type_dispatcher(lhs_col.type(), - typed_element_not_equivalent{}, - lhs_col, - rhs_col, - lhs_index, - rhs_index, - fp_ulps); + if (not comp(lhs_index_type{lhs_index}, rhs_index_type{rhs_index})) { + return type_dispatcher( + lhs.type(), typed_element_not_equivalent{}, lhs, rhs, lhs_index, rhs_index, fp_ulps); } return false; } @@ -536,25 +532,42 @@ struct column_comparator_impl { size_type fp_ulps, int depth) { - auto d_lhs = cudf::table_device_view::create(table_view{{lhs}}); - auto d_rhs = cudf::table_device_view::create(table_view{{rhs}}); - auto d_lhs_row_indices = cudf::column_device_view::create(lhs_row_indices); auto d_rhs_row_indices = cudf::column_device_view::create(rhs_row_indices); - using ComparatorType = std::conditional_t; + auto d_lhs = cudf::column_device_view::create(lhs); + auto d_rhs = cudf::column_device_view::create(rhs); + + auto lhs_tview = table_view{{lhs}}; + auto rhs_tview = table_view{{rhs}}; + + auto const comparator = cudf::experimental::row::equality::two_table_comparator{ + lhs_tview, rhs_tview, cudf::get_default_stream()}; + auto const has_nulls = cudf::has_nested_nulls(lhs_tview) or cudf::has_nested_nulls(rhs_tview); + + auto const device_comparator = comparator.equal_to(cudf::nullate::DYNAMIC{has_nulls}); + + using ComparatorType = + std::conditional_t, + corresponding_rows_not_equivalent>; auto differences = rmm::device_uvector( - lhs.size(), cudf::get_default_stream()); // worst case: everything different + lhs_row_indices.size(), cudf::get_default_stream()); // worst case: everything different auto input_iter = thrust::make_counting_iterator(0); - auto diff_iter = thrust::copy_if( + + thrust::transform( rmm::exec_policy(cudf::get_default_stream()), 
input_iter, input_iter + lhs_row_indices.size(), differences.begin(), - ComparatorType(*d_lhs, *d_rhs, *d_lhs_row_indices, *d_rhs_row_indices, fp_ulps)); + ComparatorType( + *d_lhs_row_indices, *d_rhs_row_indices, fp_ulps, device_comparator, *d_lhs, *d_rhs)); + + auto diff_iter = thrust::remove(rmm::exec_policy(cudf::get_default_stream()), + differences.begin(), + differences.end(), + 0); // remove the zero entries differences.resize(thrust::distance(differences.begin(), diff_iter), cudf::get_default_stream()); // shrink back down diff --git a/cpp/tests/utilities/tdigest_utilities.cu b/cpp/tests/utilities/tdigest_utilities.cu index 15998e32bd0..d2e95812894 100644 --- a/cpp/tests/utilities/tdigest_utilities.cu +++ b/cpp/tests/utilities/tdigest_utilities.cu @@ -64,12 +64,12 @@ void tdigest_sample_compare(cudf::tdigest::tdigest_column_view const& tdv, }); } - auto d_expected_src = - cudf::detail::make_device_uvector_async(h_expected_src, cudf::get_default_stream()); - auto d_expected_mean = - cudf::detail::make_device_uvector_async(h_expected_mean, cudf::get_default_stream()); - auto d_expected_weight = - cudf::detail::make_device_uvector_async(h_expected_weight, cudf::get_default_stream()); + auto d_expected_src = cudf::detail::make_device_uvector_async( + h_expected_src, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_expected_mean = cudf::detail::make_device_uvector_async( + h_expected_mean, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); + auto d_expected_weight = cudf::detail::make_device_uvector_async( + h_expected_weight, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto iter = thrust::make_counting_iterator(0); thrust::for_each( diff --git a/cpp/tests/utilities_tests/span_tests.cu b/cpp/tests/utilities_tests/span_tests.cu index a043e723eda..66f9fbfc0d6 100644 --- a/cpp/tests/utilities_tests/span_tests.cu +++ b/cpp/tests/utilities_tests/span_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 
2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -238,8 +238,8 @@ __global__ void simple_device_kernel(device_span result) { result[0] = tru TEST(SpanTest, CanUseDeviceSpan) { - auto d_message = - cudf::detail::make_zeroed_device_uvector_async(1, cudf::get_default_stream()); + auto d_message = cudf::detail::make_zeroed_device_uvector_async( + 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_span = device_span(d_message.data(), d_message.size()); diff --git a/cpp/tests/utilities_tests/type_check_tests.cpp b/cpp/tests/utilities_tests/type_check_tests.cpp index 84a2d15d477..f65c3652dc9 100644 --- a/cpp/tests/utilities_tests/type_check_tests.cpp +++ b/cpp/tests/utilities_tests/type_check_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -147,6 +147,7 @@ TEST_F(ColumnTypeCheckTest, DifferentFixedWidth) fixed_point_column_wrapper rhs5({10000}, numeric::scale_type{0}); EXPECT_FALSE(column_types_equal(lhs5, rhs5)); + EXPECT_TRUE(column_types_equivalent(lhs5, rhs5)); // Different rep, same scale fixed_point_column_wrapper lhs6({10000}, numeric::scale_type{-1}); diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst index 112df2fdf9f..5c28b4e7e85 100644 --- a/docs/cudf/source/api_docs/general_functions.rst +++ b/docs/cudf/source/api_docs/general_functions.rst @@ -27,6 +27,7 @@ Top-level conversions cudf.to_numeric cudf.from_dlpack + cudf.from_pandas Top-level dealing with datetimelike ----------------------------------- diff --git a/docs/dask_cudf/Makefile b/docs/dask_cudf/Makefile new file mode 100644 index 00000000000..d0c3cbf1020 --- /dev/null +++ b/docs/dask_cudf/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/dask_cudf/make.bat b/docs/dask_cudf/make.bat new file mode 100644 index 00000000000..747ffb7b303 --- /dev/null +++ b/docs/dask_cudf/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. 
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/dask_cudf/source/_static/RAPIDS-logo-purple.png b/docs/dask_cudf/source/_static/RAPIDS-logo-purple.png new file mode 100644 index 00000000000..d884e01374d Binary files /dev/null and b/docs/dask_cudf/source/_static/RAPIDS-logo-purple.png differ diff --git a/docs/dask_cudf/source/api.rst b/docs/dask_cudf/source/api.rst new file mode 100644 index 00000000000..893f5dd7434 --- /dev/null +++ b/docs/dask_cudf/source/api.rst @@ -0,0 +1,79 @@ +=============== + API reference +=============== + +This page provides a list of all publicly accessible modules, methods, +and classes in the ``dask_cudf`` namespace. + + +Creating and storing DataFrames +=============================== + +:doc:`Like Dask `, Dask-cuDF supports creation +of DataFrames from a variety of storage formats. For on-disk data that +are not supported directly in Dask-cuDF, we recommend using Dask's +data reading facilities, followed by calling +:func:`.from_dask_dataframe` to obtain a Dask-cuDF object. + +.. automodule:: dask_cudf + :members: + from_cudf, + from_dask_dataframe, + read_csv, + read_json, + read_orc, + to_orc, + read_text, + read_parquet + +.. warning:: + + FIXME: where should the following live? + + .. autofunction:: dask_cudf.concat + + .. 
autofunction:: dask_cudf.from_delayed + +Grouping +======== + +As discussed in the :doc:`Dask documentation for groupby +`, ``groupby``, ``join``, and ``merge``, and +similar operations that require matching up rows of a DataFrame become +significantly more challenging in a parallel setting than they are in +serial. Dask-cuDF has the same challenges, however for certain groupby +operations, we can take advantage of functionality in cuDF that allows +us to compute multiple aggregations at once. There are therefore two +interfaces to grouping in Dask-cuDF, the general +:meth:`DataFrame.groupby` which returns a +:class:`.CudfDataFrameGroupBy` object, and a specialized +:func:`.groupby_agg`. Generally speaking, you should not need to call +:func:`.groupby_agg` directly, since Dask-cuDF will arrange to call it +if possible. + +.. autoclass:: dask_cudf.groupby.CudfDataFrameGroupBy + :members: + :inherited-members: + :show-inheritance: + +.. autofunction:: dask_cudf.groupby_agg + + +DataFrames and Series +===================== + +The core distributed objects provided by Dask-cuDF are the +:class:`.DataFrame` and :class:`.Series`. These inherit respectively +from :class:`dask.dataframe.DataFrame` and +:class:`dask.dataframe.Series`, and so the API is essentially +identical. The full API is provided below. + +.. autoclass:: dask_cudf.DataFrame + :members: + :inherited-members: + :show-inheritance: + +.. autoclass:: dask_cudf.Series + :members: + :inherited-members: + :show-inheritance: diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py new file mode 100644 index 00000000000..1341e7fd9e7 --- /dev/null +++ b/docs/dask_cudf/source/conf.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018-2023, NVIDIA CORPORATION. + +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "dask-cudf" +copyright = "2018-2023, NVIDIA Corporation" +author = "NVIDIA Corporation" +version = "23.04" +release = "23.04.00" + +language = "en" + + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx.ext.intersphinx", + "sphinx.ext.autodoc", + "sphinx_copybutton", + "numpydoc", + "IPython.sphinxext.ipython_console_highlighting", + "IPython.sphinxext.ipython_directive", + "myst_nb", +] + +templates_path = ["_templates"] +exclude_patterns = [] + +copybutton_prompt_text = ">>> " + +# Enable automatic generation of systematic, namespaced labels for sections +myst_heading_anchors = 2 + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "pydata_sphinx_theme" +html_logo = "_static/RAPIDS-logo-purple.png" +htmlhelp_basename = "dask-cudfdoc" +html_use_modindex = True + +html_static_path = ["_static"] + +pygments_style = "sphinx" + +html_theme_options = { + "external_links": [], + "github_url": "https://github.com/rapidsai/cudf", + "twitter_url": "https://twitter.com/rapidsai", + "show_toc_level": 1, + "navbar_align": "right", +} +include_pandas_compat = True + +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "cupy": ("https://docs.cupy.dev/en/stable/", None), + "numpy": ("https://numpy.org/doc/stable", None), + "pyarrow": ("https://arrow.apache.org/docs/", None), + "cudf": ("https://docs.rapids.ai/api/cudf/stable/", None), + "dask": 
("https://docs.dask.org/en/stable/", None), + "pandas": ("https://pandas.pydata.org/docs/", None), +} + +numpydoc_show_inherited_class_members = True +numpydoc_class_members_toctree = False +numpydoc_attributes_as_param_list = False + + +def setup(app): + app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") + app.add_js_file( + "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer" + ) diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst new file mode 100644 index 00000000000..0442ab0929a --- /dev/null +++ b/docs/dask_cudf/source/index.rst @@ -0,0 +1,112 @@ +.. dask-cudf documentation coordinating file, created by + sphinx-quickstart on Mon Feb 6 18:48:11 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to dask-cudf's documentation! +===================================== + +Dask-cuDF is an extension library for the `Dask `__ +parallel computing framework that provides a `cuDF +`__-backed distributed +dataframe with the same API as `Dask dataframes +`__. + +If you are familiar with Dask and `pandas `__ or +`cuDF `__, then Dask-cuDF +should feel familiar to you. If not, we recommend starting with `10 +minutes to Dask +`__ followed +by `10 minutes to cuDF and Dask-cuDF +`__. + +When running on multi-GPU systems, `Dask-CUDA +`__ is recommended to +simplify the setup of the cluster, taking advantage of all features of +the GPU and networking hardware. + +Using Dask-cuDF +--------------- + +When installed, Dask-cuDF registers itself as a dataframe backend for +Dask. This means that in many cases, using cuDF-backed dataframes requires +only small changes to an existing workflow. The minimal change is to +select cuDF as the dataframe backend in :doc:`Dask's +configuration `. To do so, we must set the option +``dataframe.backend`` to ``cudf``. 
From Python, this can be achieved +like so:: + + import dask + + dask.config.set({"dataframe.backend": "cudf"}) + +Alternatively, you can set ``DASK_DATAFRAME__BACKEND=cudf`` in the +environment before running your code. + +Dataframe creation from on-disk formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If your workflow creates Dask dataframes from on-disk formats +(for example using :func:`dask.dataframe.read_parquet`), then setting +the backend may well be enough to migrate your workflow. + +For example, consider reading a dataframe from parquet:: + + import dask.dataframe as dd + + # By default, we obtain a pandas-backed dataframe + df = dd.read_parquet("data.parquet", ...) + + +To obtain a cuDF-backed dataframe, we must set the +``dataframe.backend`` configuration option:: + + import dask + import dask.dataframe as dd + + dask.config.set({"dataframe.backend": "cudf"}) + # This gives us a cuDF-backed dataframe + df = dd.read_parquet("data.parquet", ...) + +This code will use cuDF's GPU-accelerated :func:`parquet reader +` to read partitions of the data. + +Dataframe creation from in-memory formats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you already have a dataframe in memory and want to convert it to a +cuDF-backed one, there are two options depending on whether the +dataframe is already a Dask one or not. If you have a Dask dataframe, +then you can call :func:`dask.dataframe.to_backend` passing ``"cudf"`` +as the backend; if you have a pandas dataframe then you can either +call :func:`dask.dataframe.from_pandas` followed by +:func:`~dask.dataframe.to_backend` or first convert the dataframe with +:func:`cudf.from_pandas` and then parallelise this with +:func:`dask_cudf.from_cudf`. + +API Reference +------------- + +Generally speaking, Dask-cuDF tries to offer exactly the same API as +Dask itself. 
There are, however, some minor differences mostly because +cuDF does not :doc:`perfectly mirror ` +the pandas API, or because cuDF provides additional configuration +flags (these mostly occur in data reading and writing interfaces). + +As a result, straightforward workflows can be migrated without too +much trouble, but more complex ones that utilise more features may +need a bit of tweaking. The API documentation describes details of the +differences and all functionality that Dask-cuDF supports. + +.. toctree:: + :maxdepth: 2 + + api + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu index 8a2c0b2b411..86c2add851a 100644 --- a/java/src/main/native/src/ColumnViewJni.cu +++ b/java/src/main/native/src/ColumnViewJni.cu @@ -56,7 +56,7 @@ new_column_with_boolean_column_as_validity(cudf::column_view const &exemplar, auto [null_mask, null_count] = cudf::detail::valid_if( validity_begin, validity_end, [] __device__(auto optional_bool) { return optional_bool.value_or(false); }, - cudf::get_default_stream()); + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto const exemplar_without_null_mask = cudf::column_view{ exemplar.type(), exemplar.size(), @@ -153,8 +153,9 @@ void post_process_list_overlap(cudf::column_view const &lhs, cudf::column_view c }); // Create a new nullmask from the validity data. 
- auto [new_null_mask, new_null_count] = cudf::detail::valid_if( - validity.begin(), validity.end(), thrust::identity{}, cudf::get_default_stream()); + auto [new_null_mask, new_null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); if (new_null_count > 0) { // If the `overlap_result` column is nullable, perform `bitmask_and` of its nullmask and the diff --git a/java/src/main/native/src/maps_column_view.cu b/java/src/main/native/src/maps_column_view.cu index 23254c0d501..1af7689f972 100644 --- a/java/src/main/native/src/maps_column_view.cu +++ b/java/src/main/native/src/maps_column_view.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -55,7 +55,8 @@ std::unique_ptr get_values_for_impl(maps_column_view const &maps_view, CUDF_EXPECTS(lookup_keys.type().id() == keys_.child().type().id(), "Lookup keys must have the same type as the keys of the map column."); auto key_indices = - lists::detail::index_of(keys_, lookup_keys, lists::duplicate_find_option::FIND_LAST, stream); + lists::detail::index_of(keys_, lookup_keys, lists::duplicate_find_option::FIND_LAST, stream, + rmm::mr::get_current_device_resource()); auto constexpr absent_offset = size_type{-1}; auto constexpr nullity_offset = std::numeric_limits::min(); thrust::replace(rmm::exec_policy(stream), key_indices->mutable_view().template begin(), @@ -86,7 +87,8 @@ std::unique_ptr contains_impl(maps_column_view const &maps_view, KeyT co auto const keys = maps_view.keys(); CUDF_EXPECTS(lookup_keys.type().id() == keys.child().type().id(), "Lookup keys must have the same type as the keys of the map column."); - auto const contains = lists::detail::contains(keys, lookup_keys, stream); + auto const contains = + 
lists::detail::contains(keys, lookup_keys, stream, rmm::mr::get_current_device_resource()); // Replace nulls with BOOL8{false}; auto const scalar_false = numeric_scalar{false, true, stream}; return detail::replace_nulls(contains->view(), scalar_false, stream, mr); diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 5cf7658106f..84f84f8b46f 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -226,7 +226,8 @@ build_string_row_offsets(table_view const &tbl, size_type fixed_width_and_validi std::copy_if(offsets_iter, offsets_iter + tbl.num_columns(), std::back_inserter(offsets_iterators), [](auto const &offset_ptr) { return offset_ptr != nullptr; }); - return make_device_uvector_async(offsets_iterators, stream); + return make_device_uvector_async(offsets_iterators, stream, + rmm::mr::get_current_device_resource()); }(); auto const num_columns = static_cast(d_offsets_iterators.size()); @@ -1256,7 +1257,7 @@ static std::unique_ptr fixed_width_convert_to_rows( // Allocate and set the offsets row for the byte array std::unique_ptr offsets = - cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream); + cudf::detail::sequence(num_rows + 1, zero, scalar_size_per_row, stream, mr); std::unique_ptr data = make_numeric_column(data_type(type_id::INT8), static_cast(total_allocation), @@ -1539,7 +1540,9 @@ batch_data build_batches(size_type num_rows, RowSize row_sizes, bool all_fixed_w last_row_end = row_end; } - return {std::move(batch_row_offsets), make_device_uvector_async(batch_row_boundaries, stream), + return {std::move(batch_row_offsets), + make_device_uvector_async(batch_row_boundaries, stream, + rmm::mr::get_current_device_resource()), std::move(batch_row_boundaries), std::move(row_batches)}; } @@ -1750,8 +1753,10 @@ std::vector> convert_to_rows( return table_view(cols); }; - auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, 
stream); - auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream); + auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream, + rmm::mr::get_current_device_resource()); + auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream, + rmm::mr::get_current_device_resource()); // Get the pointers to the input columnar data ready auto const data_begin = thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { @@ -1764,8 +1769,10 @@ std::vector> convert_to_rows( thrust::make_transform_iterator(tbl.begin(), [](auto const &c) { return c.null_mask(); }); std::vector input_nm(nm_begin, nm_begin + tbl.num_columns()); - auto dev_input_data = make_device_uvector_async(input_data, stream); - auto dev_input_nm = make_device_uvector_async(input_nm, stream); + auto dev_input_data = + make_device_uvector_async(input_data, stream, rmm::mr::get_current_device_resource()); + auto dev_input_nm = + make_device_uvector_async(input_nm, stream, rmm::mr::get_current_device_resource()); // the first batch always exists unless we were sent an empty table auto const first_batch_size = batch_info.row_batches[0].row_count; @@ -1811,7 +1818,8 @@ std::vector> convert_to_rows( auto validity_tile_infos = detail::build_validity_tile_infos( tbl.num_columns(), num_rows, shmem_limit_per_tile, batch_info.row_batches); - auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream); + auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream, + rmm::mr::get_current_device_resource()); auto const validity_offset = column_info.column_starts.back(); @@ -1847,9 +1855,10 @@ std::vector> convert_to_rows( std::vector variable_width_input_data( variable_data_begin, variable_data_begin + variable_width_table.num_columns()); - auto dev_variable_input_data = make_device_uvector_async(variable_width_input_data, stream); - auto dev_variable_col_output_offsets = - 
make_device_uvector_async(column_info.variable_width_column_starts, stream); + auto dev_variable_input_data = make_device_uvector_async( + variable_width_input_data, stream, rmm::mr::get_current_device_resource()); + auto dev_variable_col_output_offsets = make_device_uvector_async( + column_info.variable_width_column_starts, stream, rmm::mr::get_current_device_resource()); for (uint i = 0; i < batch_info.row_batches.size(); i++) { auto const batch_row_offset = batch_info.batch_row_boundaries[i]; @@ -2076,8 +2085,10 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, // Ideally we would check that the offsets are all the same, etc. but for now this is probably // fine CUDF_EXPECTS(size_per_row * num_rows <= child.size(), "The layout of the data appears to be off"); - auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream); - auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream); + auto dev_col_starts = make_device_uvector_async(column_info.column_starts, stream, + rmm::mr::get_current_device_resource()); + auto dev_col_sizes = make_device_uvector_async(column_info.column_sizes, stream, + rmm::mr::get_current_device_resource()); // Allocate the columns we are going to write into std::vector> output_columns; @@ -2118,16 +2129,20 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, } } - auto dev_string_row_offsets = make_device_uvector_async(string_row_offsets, stream); - auto dev_string_lengths = make_device_uvector_async(string_lengths, stream); + auto dev_string_row_offsets = + make_device_uvector_async(string_row_offsets, stream, rmm::mr::get_current_device_resource()); + auto dev_string_lengths = + make_device_uvector_async(string_lengths, stream, rmm::mr::get_current_device_resource()); // build the row_batches from the passed in list column std::vector row_batches; row_batches.push_back( {detail::row_batch{child.size(), num_rows, device_uvector(0, stream)}}); - auto dev_output_data = make_device_uvector_async(output_data, stream); - auto dev_output_nm = make_device_uvector_async(output_nm, stream); + auto dev_output_data = + make_device_uvector_async(output_data, stream, rmm::mr::get_current_device_resource()); + auto dev_output_nm = + make_device_uvector_async(output_nm, stream, rmm::mr::get_current_device_resource()); // only ever get a single batch when going from rows, so boundaries are 0, num_rows constexpr auto num_batches = 2; @@ -2164,7 +2179,8 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, auto validity_tile_infos = detail::build_validity_tile_infos(schema.size(), num_rows, shmem_limit_per_tile, row_batches); - auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream); + auto dev_validity_tile_infos = make_device_uvector_async(validity_tile_infos, stream, + rmm::mr::get_current_device_resource()); dim3 const validity_blocks(validity_tile_infos.size()); @@ -2221,8 +2237,10 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, string_col_offsets.push_back(std::move(output_string_offsets)); string_data_cols.push_back(std::move(string_data)); } - auto dev_string_col_offsets = make_device_uvector_async(string_col_offset_ptrs, stream); - auto dev_string_data_cols = make_device_uvector_async(string_data_col_ptrs, stream); + auto dev_string_col_offsets = make_device_uvector_async(string_col_offset_ptrs, stream, + rmm::mr::get_current_device_resource()); + auto dev_string_data_cols = make_device_uvector_async(string_data_col_ptrs, stream, + rmm::mr::get_current_device_resource()); dim3 const string_blocks( std::min(std::max(MIN_STRING_BLOCKS, num_rows / NUM_STRING_ROWS_PER_BLOCK_FROM_ROWS), @@ -2274,8 +2292,10 @@ std::unique_ptr
convert_from_rows_fixed_width_optimized( // fine CUDF_EXPECTS(size_per_row * num_rows == child.size(), "The layout of the data appears to be off"); - auto dev_column_start = make_device_uvector_async(column_start, stream); - auto dev_column_size = make_device_uvector_async(column_size, stream); + auto dev_column_start = + make_device_uvector_async(column_start, stream, rmm::mr::get_current_device_resource()); + auto dev_column_size = + make_device_uvector_async(column_size, stream, rmm::mr::get_current_device_resource()); // Allocate the columns we are going to write into std::vector> output_columns; diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 6590cf2940d..ccf730c91fb 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. from collections import abc @@ -279,12 +279,8 @@ def cut( if labels is not None: if labels is not ordered and len(set(labels)) != len(labels): # when we have duplicate labels and ordered is False, we - # should allow duplicate categories. The categories are - # returned in order - new_data = [interval_labels[i][0] for i in index_labels.values] - return cudf.CategoricalIndex( - new_data, categories=sorted(set(labels)), ordered=False - ) + # should allow duplicate categories. 
+ return interval_labels[index_labels] col = build_categorical_column( categories=interval_labels, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e50c324a8f4..672e663d316 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -259,9 +259,12 @@ def _getitem_tuple_arg(self, arg): else: if isinstance(arg, tuple): - return columns_df.index._get_row_major(columns_df, arg[0]) + row_arg = arg[0] + elif is_scalar(arg): + row_arg = (arg,) else: - return columns_df.index._get_row_major(columns_df, arg) + row_arg = arg + return columns_df.index._get_row_major(columns_df, row_arg) else: if isinstance(arg[0], slice): out = _get_label_range_or_mask( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 413e005b798..d1408fec160 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1403,7 +1403,7 @@ def __repr__(self): @_cudf_nvtx_annotate def __getitem__(self, index): res = self._get_elements_from_column(index) - if not isinstance(index, int): + if isinstance(res, ColumnBase): res = as_index(res) res.name = self.name return res diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 79927c60a85..8ec08b7c92a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -250,7 +250,11 @@ def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: if isinstance(self._frame.index, cudf.MultiIndex) and not isinstance( arg, cudf.MultiIndex ): - result = self._frame.index._get_row_major(self._frame, arg) + if is_scalar(arg): + row_arg = (arg,) + else: + row_arg = arg + result = self._frame.index._get_row_major(self._frame, row_arg) if ( isinstance(arg, tuple) and len(arg) == self._frame._index.nlevels diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index d043b917251..0b0c5fba7fa 100644 --- a/python/cudf/cudf/tests/test_index.py +++ 
b/python/cudf/cudf/tests/test_index.py @@ -2886,3 +2886,22 @@ def test_index_to_pandas_nullable(data, expected_dtype): expected = pd.Index(data, dtype=expected_dtype) assert_eq(pi, expected) + + +class TestIndexScalarGetItem: + @pytest.fixture( + params=[range(1, 10, 2), [1, 2, 3], ["a", "b", "c"], [1.5, 2.5, 3.5]] + ) + def index_values(self, request): + return request.param + + @pytest.fixture(params=[int, np.int8, np.int32, np.int64]) + def i(self, request): + return request.param(1) + + def test_scalar_getitem(self, index_values, i): + index = cudf.Index(index_values) + + assert not isinstance(index[i], cudf.Index) + assert index[i] == index_values[i] + assert_eq(index, index.to_pandas()) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 5012ae0979f..95936c48b7c 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1446,6 +1446,8 @@ def test_loc_zero_dim_array(): reason="https://github.com/pandas-dev/pandas/issues/46704" ), ), + 1, + 2, ], ) def test_loc_series_multiindex(arg): diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 0f04e8c0f2d..a0e027d4c86 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -319,6 +319,9 @@ def test_multiindex_getitem(pdf, gdf, pdfIndex): (("a", "store"), slice(None)), # return 2 rows, n-1 remaining keys = dataframe with n-k index columns ("a",), + "a", + "b", + "c", (("a",), slice(None)), # return 1 row, 0 remaining keys = dataframe with entire index ("a", "store", "storm", "smoke"), diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 60bbe5d9571..d2858876fcd 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,6 +1,7 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
import math +import textwrap import warnings import numpy as np @@ -68,6 +69,18 @@ def to_dask_dataframe(self, **kwargs): class DataFrame(_Frame, dd.core.DataFrame): + """ + A distributed Dask DataFrame where the backing dataframe is a + :class:`cuDF DataFrame `. + + Typically you would not construct this object directly, but rather + use one of Dask-cuDF's IO routines. + + Most operations on :doc:`Dask DataFrames ` are + supported, with many of the same caveats. + + """ + _partition_type = cudf.DataFrame @_dask_cudf_nvtx_annotate @@ -671,12 +684,35 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): from_cudf.__doc__ = ( - "Wraps main-line Dask from_pandas...\n" + dd.from_pandas.__doc__ + textwrap.dedent( + """ + Create a :class:`.DataFrame` from a :class:`cudf.DataFrame`. + + This function is a thin wrapper around + :func:`dask.dataframe.from_pandas`, accepting the same + arguments (described below) excepting that it operates on cuDF + rather than pandas objects.\n + """ + ) + + textwrap.dedent(dd.from_pandas.__doc__) ) @_dask_cudf_nvtx_annotate def from_dask_dataframe(df): + """ + Convert a Dask :class:`dask.dataframe.DataFrame` to a Dask-cuDF + one. + + Parameters + ---------- + df : dask.dataframe.DataFrame + The Dask dataframe to convert + + Returns + ------- + dask_cudf.DataFrame : A new Dask collection backed by cuDF objects + """ return df.map_partitions(cudf.from_pandas) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index f91738bdab0..f4bbcaf4dd1 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from functools import wraps from typing import Set @@ -433,22 +433,55 @@ def groupby_agg( ): """Optimized groupby aggregation for Dask-CuDF. 
- This aggregation algorithm only supports the following options: - - - "count" - - "mean" - - "std" - - "var" - - "sum" - - "min" - - "max" - - "collect" - - "first" - - "last" - - This "optimized" approach is more performant than the algorithm - in `dask.dataframe`, because it allows the cudf backend to - perform multiple aggregations at once. + Parameters + ---------- + ddf : DataFrame + DataFrame object to perform grouping on. + gb_cols : str or list[str] + Column names to group by. + aggs_in : str, list, or dict + Aggregations to perform. + split_every : int (optional) + How to group intermediate aggregates. + dropna : bool + Drop grouping key values corresponding to NA values. + as_index : bool + Currently ignored. + sort : bool + Sort the group keys, better performance is obtained when + not sorting. + shuffle : str (optional) + Control how shuffling of the DataFrame is performed. + sep : str + Internal usage. + + + Notes + ----- + This "optimized" approach is more performant than the algorithm in + implemented in :meth:`DataFrame.apply` because it allows the cuDF + backend to perform multiple aggregations at once. + + This aggregation algorithm only supports the following options + + * "collect" + * "count" + * "first" + * "last" + * "max" + * "mean" + * "min" + * "std" + * "sum" + * "var" + + + See Also + -------- + DataFrame.groupby : generic groupby of a DataFrame + dask.dataframe.apply_concat_apply : for more description of the + split_every argument. + """ # Assert that aggregations are supported aggs = _redirect_aggs(aggs_in) diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index b4d080fd182..fd27083bbf4 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -16,9 +16,10 @@ def read_csv(path, blocksize="default", **kwargs): """ - Read CSV files into a dask_cudf.DataFrame + Read CSV files into a :class:`.DataFrame`. 
- This API parallelizes the ``cudf.read_csv`` function in the following ways: + This API parallelizes the :func:`cudf:cudf.read_csv` function in + the following ways: It supports loading many files at once using globstrings: @@ -34,23 +35,26 @@ def read_csv(path, blocksize="default", **kwargs): >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") - Internally ``dask_cudf.read_csv`` uses ``cudf.read_csv`` and supports - many of the same keyword arguments with the same performance guarantees. - See the docstring for ``cudf.read_csv()`` for more information on available + Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and + supports many of the same keyword arguments with the same + performance guarantees. See the docstring for + :func:`cudf:cudf.read_csv` for more information on available keyword arguments. Parameters ---------- path : str, path object, or file-like object - Either a path to a file (a str, pathlib.Path, or - py._path.local.LocalPath), URL (including http, ftp, and S3 locations), - or any object with a read() method (such as builtin open() file - handler function or StringIO). + Either a path to a file (a str, :py:class:`pathlib.Path`, or + py._path.local.LocalPath), URL (including http, ftp, and S3 + locations), or any object with a read() method (such as + builtin :py:func:`open` file handler function or + :py:class:`~io.StringIO`). blocksize : int or str, default "256 MiB" - The target task partition size. If `None`, a single block + The target task partition size. If ``None``, a single block is used for each file. **kwargs : dict - Passthrough key-word arguments that are sent to ``cudf.read_csv``. + Passthrough key-word arguments that are sent to + :func:`cudf:cudf.read_csv`. 
Examples -------- @@ -61,6 +65,7 @@ def read_csv(path, blocksize="default", **kwargs): 0 1 hi 1 2 hello 2 3 ai + """ # Handle `chunksize` deprecation diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index bb3d0f3c601..2a6ad603414 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -10,30 +10,33 @@ def read_json(url_path, engine="auto", **kwargs): - """Create a dask_cudf DataFrame collection from JSON data + """Read JSON data into a :class:`.DataFrame`. - This function wraps ``dask.dataframe.read_json``, and passes + This function wraps :func:`dask.dataframe.read_json`, and passes ``engine=partial(cudf.read_json, engine="auto")`` by default. Parameters ---------- - url_path: str, list of str + url_path : str, list of str Location to read from. If a string, can include a glob character to find a set of file names. Supports protocol specifications such as ``"s3://"``. engine : str or Callable, default "auto" - If str, this value will be used as the ``engine`` argument when - ``cudf.read_json`` is used to create each partition. If Callable, - this value will be used as the underlying function used to create - each partition from JSON data. The default value is "auto", so - that ``engine=partial(cudf.read_json, engine="auto")`` will be - passed to ``dask.dataframe.read_json`` by default. + + If str, this value will be used as the ``engine`` argument + when :func:`cudf.read_json` is used to create each partition. + If a :obj:`~typing.Callable`, this value will be used as the + underlying function used to create each partition from JSON + data. The default value is "auto", so that + ``engine=partial(cudf.read_json, engine="auto")`` will be + passed to :func:`dask.dataframe.read_json` by default. + **kwargs : - Key-word arguments to pass through to ``dask.dataframe.read_json``. + Key-word arguments to pass through to :func:`dask.dataframe.read_json`. 
Returns ------- - dask_cudf.DataFrame + :class:`.DataFrame` Examples -------- @@ -53,7 +56,8 @@ def read_json(url_path, engine="auto", **kwargs): See Also -------- - dask.dataframe.io.json.read_json + dask.dataframe.read_json + """ # TODO: Add optimized code path to leverage the diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index e731057ed90..49fea0d7602 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from io import BufferedWriter, IOBase @@ -25,37 +25,45 @@ def _read_orc_stripe(fs, path, stripe, columns, kwargs=None): def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): - """Read cudf dataframe from ORC file(s). + """Read ORC files into a :class:`.DataFrame`. Note that this function is mostly borrowed from upstream Dask. Parameters ---------- - path: str or list(str) + path : str or list[str] Location of file(s), which can be a full URL with protocol specifier, and may include glob character if a single string. - columns: None or list(str) + columns : None or list[str] Columns to load. If None, loads all. filters : None or list of tuple or list of lists of tuples - If not None, specifies a filter predicate used to filter out row groups - using statistics stored for each row group as Parquet metadata. Row - groups that do not match the given filter predicate are not read. The - predicate is expressed in disjunctive normal form (DNF) like - `[[('x', '=', 0), ...], ...]`. DNF allows arbitrary boolean logical - combinations of single column predicates. The innermost tuples each - describe a single column predicate. The list of inner predicates is - interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the outermost list combines - these filters as a disjunction (OR). 
Predicates may also be passed - as a list of tuples. This form is interpreted as a single conjunction. - To express OR in predicates, one must use the (preferred) notation of - list of lists of tuples. - storage_options: None or dict + If not None, specifies a filter predicate used to filter out + row groups using statistics stored for each row group as + Parquet metadata. Row groups that do not match the given + filter predicate are not read. The predicate is expressed in + `disjunctive normal form (DNF) + `__ + like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary + boolean logical combinations of single column predicates. The + innermost tuples each describe a single column predicate. The + list of inner predicates is interpreted as a conjunction + (AND), forming a more selective and multiple column predicate. + Finally, the outermost list combines these filters as a + disjunction (OR). Predicates may also be passed as a list of + tuples. This form is interpreted as a single conjunction. To + express OR in predicates, one must use the (preferred) + notation of list of lists of tuples. + storage_options : None or dict Further parameters to pass to the bytes backend. + See Also + -------- + dask.dataframe.read_orc + Returns ------- - cudf.DataFrame + dask_cudf.DataFrame + """ storage_options = storage_options or {} @@ -133,22 +141,25 @@ def to_orc( compute=True, **kwargs, ): - """Write a dask_cudf dataframe to ORC file(s) (one file per partition). + """ + Write a :class:`.DataFrame` to ORC file(s) (one file per partition). Parameters ---------- - df : dask_cudf.DataFrame - path: string or pathlib.Path + df : DataFrame + path : str or pathlib.Path Destination directory for data. Prepend with protocol like ``s3://`` or ``hdfs://`` for remote data. write_index : boolean, optional Whether or not to write the index. Defaults to True. - storage_options: None or dict + storage_options : None or dict Further parameters to pass to the bytes backend. 
compression : string or dict, optional compute : bool, optional - If True (default) then the result is computed immediately. If False - then a ``dask.delayed`` object is returned for future computation. + If True (default) then the result is computed immediately. If + False then a :class:`~dask.delayed.Delayed` object is returned + for future computation. + """ from dask import compute as dask_compute, delayed diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 452f2f8914a..b03ac256b05 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -438,13 +438,14 @@ def set_object_dtypes_from_pa_schema(df, schema): def read_parquet(path, columns=None, **kwargs): - """Read parquet files into a Dask DataFrame + """ + Read parquet files into a :class:`.DataFrame`. - Calls ``dask.dataframe.read_parquet`` with ``engine=CudfEngine`` - to coordinate the execution of ``cudf.read_parquet``, and to - ultimately create a ``dask_cudf.DataFrame`` collection. + Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` + to coordinate the execution of :func:`cudf.read_parquet`, and to + ultimately create a :class:`.DataFrame` collection. - See the ``dask.dataframe.read_parquet`` documentation for + See the :func:`dask.dataframe.read_parquet` documentation for all available options. Examples @@ -469,6 +470,7 @@ def read_parquet(path, columns=None, **kwargs): See Also -------- cudf.read_parquet + dask.dataframe.read_parquet """ if isinstance(columns, str): columns = [columns]