diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index d02825b73d1..dd4482375b9 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -133,5 +133,6 @@ jobs:
with:
build_type: pull-request
package-name: dask_cudf
- test-before: "RAPIDS_PY_WHEEL_NAME=cudf_cu11 rapids-download-wheels-from-s3 ./local-cudf-dep && python -m pip install --no-deps ./local-cudf-dep/cudf*.whl"
+ # Install the cudf we just built, and also test against latest dask/distributed/dask-cuda.
+ test-before: "RAPIDS_PY_WHEEL_NAME=cudf_cu11 rapids-download-wheels-from-s3 ./local-cudf-dep && python -m pip install --no-deps ./local-cudf-dep/cudf*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
test-unittest: "python -m pytest -v -n 8 ./python/dask_cudf/dask_cudf/tests"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index c808e1475e6..a4bd14439b0 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -97,4 +97,6 @@ jobs:
date: ${{ inputs.date }}
sha: ${{ inputs.sha }}
package-name: dask_cudf
+ # Test against latest dask/distributed/dask-cuda.
+ test-before: "pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.04"
test-unittest: "python -m pytest -v -n 8 ./python/dask_cudf/dask_cudf/tests"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1eb2c508db9..8b46eb25950 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -161,7 +161,7 @@ repos:
^CHANGELOG.md$
)
- repo: https://github.com/rapidsai/dependency-file-generator
- rev: v1.4.0
+ rev: v1.5.1
hooks:
- id: rapids-dependency-file-generator
args: ["--clean"]
diff --git a/build.sh b/build.sh
index bee66d819b4..7cbd0fceb5a 100755
--- a/build.sh
+++ b/build.sh
@@ -300,8 +300,7 @@ if buildAll || hasArg libcudf; then
# Record build times
if [[ "$BUILD_REPORT_METRICS" == "ON" && -f "${LIB_BUILD_DIR}/.ninja_log" ]]; then
echo "Formatting build metrics"
- python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt xml > ${LIB_BUILD_DIR}/ninja_log.xml
- MSG="<p>"
+ MSG=""
# get some sccache stats after the compile
if [[ "$BUILD_REPORT_INCL_CACHE_STATS" == "ON" && -x "$(command -v sccache)" ]]; then
COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }')
@@ -318,7 +317,9 @@ if buildAll || hasArg libcudf; then
BMR_DIR=${RAPIDS_ARTIFACTS_DIR:-"${LIB_BUILD_DIR}"}
echo "Metrics output dir: [$BMR_DIR]"
mkdir -p ${BMR_DIR}
- python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$MSG" > ${BMR_DIR}/ninja_log.html
+ MSG_OUTFILE="$(mktemp)"
+ echo "$MSG" > "${MSG_OUTFILE}"
+ python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "${MSG_OUTFILE}" > ${BMR_DIR}/ninja_log.html
cp ${LIB_BUILD_DIR}/.ninja_log ${BMR_DIR}/ninja.log
fi
diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index b68c2bdbef6..bc27e7d76b0 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -14,29 +14,3 @@ rapids-logger "Begin cpp build"
rapids-mamba-retry mambabuild conda/recipes/libcudf
rapids-upload-conda-to-s3 cpp
-
-echo "++++++++++++++++++++++++++++++++++++++++++++"
-
-if [[ -d $RAPIDS_ARTIFACTS_DIR ]]; then
- ls -l ${RAPIDS_ARTIFACTS_DIR}
-fi
-
-echo "++++++++++++++++++++++++++++++++++++++++++++"
-
-FILE=${RAPIDS_ARTIFACTS_DIR}/ninja.log
-if [[ -f $FILE ]]; then
- echo -e "\x1B[33;1m\x1B[48;5;240m Ninja log for this build available at the following link \x1B[0m"
- UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).ninja.log
- rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}"
-fi
-
-echo "++++++++++++++++++++++++++++++++++++++++++++"
-
-FILE=${RAPIDS_ARTIFACTS_DIR}/ninja_log.html
-if [[ -f $FILE ]]; then
- echo -e "\x1B[33;1m\x1B[48;5;240m Build Metrics Report for this build available at the following link \x1B[0m"
- UPLOAD_NAME=cpp_cuda${RAPIDS_CUDA_VERSION%%.*}_$(arch).BuildMetricsReport.html
- rapids-upload-to-s3 "${UPLOAD_NAME}" "${FILE}"
-fi
-
-echo "++++++++++++++++++++++++++++++++++++++++++++"
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 6daedb59733..4955fe08982 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -33,16 +33,25 @@ aws s3 cp s3://rapidsai-docs/librmm/${VERSION_NUMBER}/html/rmm.tag . || echo "Fa
doxygen Doxyfile
popd
-rapids-logger "Build Sphinx docs"
+rapids-logger "Build cuDF Sphinx docs"
pushd docs/cudf
sphinx-build -b dirhtml source _html
sphinx-build -b text source _text
popd
+rapids-logger "Build dask-cuDF Sphinx docs"
+pushd docs/dask_cudf
+sphinx-build -b dirhtml source _html
+sphinx-build -b text source _text
+popd
+
+
if [[ ${RAPIDS_BUILD_TYPE} == "branch" ]]; then
rapids-logger "Upload Docs to S3"
aws s3 sync --no-progress --delete cpp/doxygen/html "s3://rapidsai-docs/libcudf/${VERSION_NUMBER}/html"
aws s3 sync --no-progress --delete docs/cudf/_html "s3://rapidsai-docs/cudf/${VERSION_NUMBER}/html"
aws s3 sync --no-progress --delete docs/cudf/_text "s3://rapidsai-docs/cudf/${VERSION_NUMBER}/txt"
+ aws s3 sync --no-progress --delete docs/dask_cudf/_html "s3://rapidsai-docs/dask-cudf/${VERSION_NUMBER}/html"
+ aws s3 sync --no-progress --delete docs/dask_cudf/_text "s3://rapidsai-docs/dask-cudf/${VERSION_NUMBER}/txt"
fi
diff --git a/ci/release/apply_wheel_modifications.sh b/ci/release/apply_wheel_modifications.sh
index 9d9758f1f15..0c55c4b9141 100755
--- a/ci/release/apply_wheel_modifications.sh
+++ b/ci/release/apply_wheel_modifications.sh
@@ -6,12 +6,6 @@
VERSION=${1}
CUDA_SUFFIX=${2}
-# __init__.py versions
-sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/cudf/cudf/__init__.py
-sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/dask_cudf/dask_cudf/__init__.py
-sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/cudf_kafka/cudf_kafka/__init__.py
-sed -i "s/__version__ = .*/__version__ = \"${VERSION}\"/g" python/custreamz/custreamz/__init__.py
-
# pyproject.toml versions
sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/cudf/pyproject.toml
sed -i "s/^version = .*/version = \"${VERSION}\"/g" python/dask_cudf/pyproject.toml
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index e5c9ba0569f..dc5ea6015f9 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -24,6 +24,11 @@ NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')
NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*"
+# Need to distutils-normalize the versions for some use cases
+CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))")
+NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
+echo "current is ${CURRENT_SHORT_TAG_PEP440}, next is ${NEXT_SHORT_TAG_PEP440}"
+
echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"
# Inplace sed replace; workaround for Linux and Mac
@@ -70,9 +75,10 @@ sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/
# bump rmm & dask-cuda
for FILE in conda/environments/*.yaml dependencies.yaml; do
- sed_runner "s/dask-cuda=${CURRENT_SHORT_TAG}/dask-cuda=${NEXT_SHORT_TAG}/g" ${FILE};
- sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE};
- sed_runner "s/rmm-cu11=${CURRENT_SHORT_TAG}/rmm-cu11=${NEXT_SHORT_TAG}/g" ${FILE};
+ sed_runner "s/dask-cuda==${CURRENT_SHORT_TAG_PEP440}/dask-cuda==${NEXT_SHORT_TAG_PEP440}/g" ${FILE};
+ sed_runner "s/rmm==${CURRENT_SHORT_TAG_PEP440}/rmm==${NEXT_SHORT_TAG_PEP440}/g" ${FILE};
+ sed_runner "s/cudf==${CURRENT_SHORT_TAG_PEP440}/cudf==${NEXT_SHORT_TAG_PEP440}/g" ${FILE};
+ sed_runner "s/cudf_kafka==${CURRENT_SHORT_TAG_PEP440}/cudf_kafka==${NEXT_SHORT_TAG_PEP440}/g" ${FILE};
done
# Doxyfile update
@@ -86,13 +92,11 @@ sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md
sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt
sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt
-# Need to distutils-normalize the original version
-NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
-
# Dependency versions in pyproject.toml
sed_runner "s/rmm==.*\",/rmm==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/cudf/pyproject.toml
sed_runner "s/cudf==.*\",/cudf==${NEXT_SHORT_TAG_PEP440}.*\",/g" python/dask_cudf/pyproject.toml
for FILE in .github/workflows/*.yaml; do
sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
+ sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE};
done
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index bd7a82afbea..846b90c78e5 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -8,35 +8,34 @@ trap "EXITCODE=1" ERR
set +e
# Get library for finding incorrect default stream usage.
-STREAM_IDENTIFY_LIB="${CONDA_PREFIX}/lib/libcudf_identify_stream_usage.so"
+STREAM_IDENTIFY_LIB_MODE_CUDF="${CONDA_PREFIX}/lib/libcudf_identify_stream_usage_mode_cudf.so"
+STREAM_IDENTIFY_LIB_MODE_TESTING="${CONDA_PREFIX}/lib/libcudf_identify_stream_usage_mode_testing.so"
-echo "STREAM_IDENTIFY_LIB=${STREAM_IDENTIFY_LIB}"
+echo "STREAM_IDENTIFY_LIB=${STREAM_IDENTIFY_LIB_MODE_CUDF}"
# Run libcudf and libcudf_kafka gtests from libcudf-tests package
rapids-logger "Run gtests"
-# TODO: exit code handling is too verbose. Find a cleaner solution.
-
-for gt in "$CONDA_PREFIX"/bin/gtests/{libcudf,libcudf_kafka}/* ; do
- test_name=$(basename ${gt})
- echo "Running gtest $test_name"
-
- # TODO: This strategy for using the stream lib will need to change when we
- # switch to invoking ctest. For one, we will want to set the test
- # properties to use the lib (which means that the decision will be made at
- # CMake-configure time instead of runtime). We may also need to leverage
- # something like gtest_discover_tests to be able to filter on the
- # underlying test names.
- if [[ ${test_name} == "SPAN_TEST" ]]; then
- # This one test is specifically designed to test using a thrust device
- # vector, so we expect and allow it to include default stream usage.
- gtest_filter="SpanTest.CanConstructFromDeviceContainers"
- GTEST_CUDF_STREAM_MODE="custom" LD_PRELOAD=${STREAM_IDENTIFY_LIB} ${gt} --gtest_output=xml:${RAPIDS_TESTS_DIR} --gtest_filter="-${gtest_filter}" && \
- ${gt} --gtest_output=xml:${RAPIDS_TESTS_DIR} --gtest_filter="${gtest_filter}"
- else
- GTEST_CUDF_STREAM_MODE="custom" LD_PRELOAD=${STREAM_IDENTIFY_LIB} ${gt} --gtest_output=xml:${RAPIDS_TESTS_DIR}
- fi
-done
+cd $CONDA_PREFIX/bin/gtests/libcudf/
+export GTEST_CUDF_STREAM_MODE="new_cudf_default"
+export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/
+export LD_PRELOAD=${STREAM_IDENTIFY_LIB_MODE_CUDF}
+
+ctest -E SPAN_TEST -j20 --output-on-failure
+
+# This one test is specifically designed to test using a thrust device vector,
+# so we expect and allow it to include default stream usage.
+_allowlist_filter="SpanTest.CanConstructFromDeviceContainers"
+GTEST_FILTER="-${_allowlist_filter}" ctest -R SPAN_TEST -VV
+LD_PRELOAD= GTEST_CUDF_STREAM_MODE=default GTEST_FILTER="${_allowlist_filter}" ctest -R SPAN_TEST -VV
+
+SUITEERROR=$?
+
+if (( ${SUITEERROR} == 0 )); then
+ cd $CONDA_PREFIX/bin/gtests/libcudf_kafka/
+ ctest -j20 --output-on-failure
+ SUITEERROR=$?
+fi
rapids-logger "Test script exiting with value: $EXITCODE"
exit ${EXITCODE}
diff --git a/ci/test_cpp_memcheck.sh b/ci/test_cpp_memcheck.sh
index db9ce143d51..0e85268cb72 100755
--- a/ci/test_cpp_memcheck.sh
+++ b/ci/test_cpp_memcheck.sh
@@ -11,7 +11,7 @@ set +e
rapids-logger "Memcheck gtests with rmm_mode=cuda"
export GTEST_CUDF_RMM_MODE=cuda
COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck"
-for gt in "$CONDA_PREFIX"/bin/gtests/libcudf/* ; do
+for gt in "$CONDA_PREFIX"/bin/gtests/libcudf/*_TEST ; do
test_name=$(basename ${gt})
if [[ "$test_name" == "ERROR_TEST" ]] || [[ "$test_name" == "STREAM_IDENTIFICATION_TEST" ]]; then
continue
diff --git a/ci/test_java.sh b/ci/test_java.sh
index f905aaa1178..e4df62501cc 100755
--- a/ci/test_java.sh
+++ b/ci/test_java.sh
@@ -38,7 +38,7 @@ set +e
rapids-logger "Run Java tests"
pushd java
-mvn test -B -DCUDF_JNI_ARROW_STATIC=OFF -DCUDF_JNI_ENABLE_PROFILING=OFF
+mvn test -B -DCUDF_JNI_ENABLE_PROFILING=OFF
popd
rapids-logger "Test script exiting with value: $EXITCODE"
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 66d375910d4..890cb199419 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -21,7 +21,7 @@ dependencies:
- cupy>=9.5.0,<12.0.0a0
- cxx-compiler
- cython>=0.29,<0.30
-- dask-cuda=23.04.*
+- dask-cuda==23.4.*
- dask>=2023.1.1
- distributed>=2023.1.1
- dlpack>=0.5,<0.6.0a0
@@ -30,18 +30,21 @@ dependencies:
- fmt>=9.1.0,<10
- fsspec>=0.6.0
- gcc_linux-64=11.*
+- gmock==1.10.0.*
+- gtest==1.10.0.*
- hypothesis
- ipython
-- libarrow=10
+- libarrow==10.0.1.*
- librdkafka=1.7.0
-- librmm=23.04.*
+- librmm==23.4.*
- mimesis>=4.1.0
- moto>=4.0.8
+- msgpack-python
- myst-nb
- nbsphinx
- ninja
- notebook
-- numba>=0.56.2
+- numba>=0.56.4,<0.57
- numpy>=1.21
- numpydoc
- nvcc_linux-64=11.8
@@ -53,7 +56,7 @@ dependencies:
- pre-commit
- protobuf>=4.21.6,<4.22
- ptxcompiler
-- pyarrow=10
+- pyarrow==10.0.1.*
- pydata-sphinx-theme
- pyorc
- pytest
@@ -61,11 +64,11 @@ dependencies:
- pytest-cases
- pytest-cov
- pytest-xdist
-- python-confluent-kafka=1.7.0
+- python-confluent-kafka==1.7.0
- python-snappy>=0.6.0
- python>=3.8,<3.11
- pytorch<1.12.0
-- rmm=23.04.*
+- rmm==23.4.*
- s3fs>=2022.3.0
- scikit-build>=0.13.1
- scipy
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 6b23c8953d3..bbd9961320a 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -52,7 +52,7 @@ requirements:
- cython >=0.29,<0.30
- scikit-build >=0.13.1
- setuptools
- - numba >=0.56.2
+ - numba >=0.56.4,<0.57
- dlpack >=0.5,<0.6.0a0
- pyarrow =10
- libcudf ={{ version }}
@@ -64,7 +64,7 @@ requirements:
- typing_extensions
- pandas >=1.3,<1.6.0dev0
- cupy >=9.5.0,<12.0.0a0
- - numba >=0.56.2
+ - numba >=0.56.4,<0.57
- numpy >=1.21
- {{ pin_compatible('pyarrow', max_pin='x.x.x') }}
- libcudf {{ version }}
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 770a234b56e..469c25fb673 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -51,6 +51,8 @@ requirements:
- librdkafka {{ librdkafka_version }}
- fmt {{ fmt_version }}
- spdlog {{ spdlog_version }}
+ - gtest {{ gtest_version }}
+ - gmock {{ gtest_version }}
outputs:
- name: libcudf
@@ -71,10 +73,14 @@ outputs:
- librmm ={{ minor_version }}
- libarrow {{ libarrow_version }}
- dlpack {{ dlpack_version }}
+ - gtest {{ gtest_version }}
+ - gmock {{ gtest_version }}
test:
commands:
- test -f $PREFIX/lib/libcudf.so
- test -f $PREFIX/lib/libcudftestutil.a
+ - test -f $PREFIX/lib/libcudf_identify_stream_usage_mode_cudf.so
+ - test -f $PREFIX/lib/libcudf_identify_stream_usage_mode_testing.so
- test -f $PREFIX/include/cudf/aggregation.hpp
- test -f $PREFIX/include/cudf/ast/detail/expression_parser.hpp
- test -f $PREFIX/include/cudf/ast/detail/operators.hpp
@@ -86,6 +92,7 @@ outputs:
- test -f $PREFIX/include/cudf/concatenate.hpp
- test -f $PREFIX/include/cudf/copying.hpp
- test -f $PREFIX/include/cudf/datetime.hpp
+ - test -f $PREFIX/include/cudf/timezone.hpp
- test -f $PREFIX/include/cudf/detail/aggregation/aggregation.hpp
- test -f $PREFIX/include/cudf/detail/aggregation/result_cache.hpp
- test -f $PREFIX/include/cudf/detail/binaryop.hpp
@@ -107,7 +114,6 @@ outputs:
- test -f $PREFIX/include/cudf/detail/nvtx/nvtx3.hpp
- test -f $PREFIX/include/cudf/detail/nvtx/ranges.hpp
- test -f $PREFIX/include/cudf/detail/quantiles.hpp
- - test -f $PREFIX/include/cudf/detail/reduction_functions.hpp
- test -f $PREFIX/include/cudf/detail/repeat.hpp
- test -f $PREFIX/include/cudf/detail/replace.hpp
- test -f $PREFIX/include/cudf/detail/reshape.hpp
@@ -116,12 +122,13 @@ outputs:
- test -f $PREFIX/include/cudf/detail/scan.hpp
- test -f $PREFIX/include/cudf/detail/scatter.hpp
- test -f $PREFIX/include/cudf/detail/search.hpp
- - test -f $PREFIX/include/cudf/detail/segmented_reduction_functions.hpp
- test -f $PREFIX/include/cudf/detail/sequence.hpp
- test -f $PREFIX/include/cudf/detail/sorting.hpp
- test -f $PREFIX/include/cudf/detail/stream_compaction.hpp
- test -f $PREFIX/include/cudf/detail/structs/utilities.hpp
- test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp
+ - test -f $PREFIX/include/cudf/detail/timezone.cuh
+ - test -f $PREFIX/include/cudf/detail/timezone.hpp
- test -f $PREFIX/include/cudf/detail/transform.hpp
- test -f $PREFIX/include/cudf/detail/transpose.hpp
- test -f $PREFIX/include/cudf/detail/unary.hpp
@@ -209,6 +216,8 @@ outputs:
- test -f $PREFIX/include/cudf/partitioning.hpp
- test -f $PREFIX/include/cudf/quantiles.hpp
- test -f $PREFIX/include/cudf/reduction.hpp
+ - test -f $PREFIX/include/cudf/reduction/detail/reduction_functions.hpp
+ - test -f $PREFIX/include/cudf/reduction/detail/segmented_reduction_functions.hpp
- test -f $PREFIX/include/cudf/replace.hpp
- test -f $PREFIX/include/cudf/reshape.hpp
- test -f $PREFIX/include/cudf/rolling.hpp
@@ -294,11 +303,12 @@ outputs:
- test -f $PREFIX/include/cudf_test/column_wrapper.hpp
- test -f $PREFIX/include/cudf_test/cudf_gtest.hpp
- test -f $PREFIX/include/cudf_test/cxxopts.hpp
+ - test -f $PREFIX/include/cudf_test/default_stream.hpp
- test -f $PREFIX/include/cudf_test/detail/column_utilities.hpp
- test -f $PREFIX/include/cudf_test/file_utilities.hpp
- test -f $PREFIX/include/cudf_test/io_metadata_utilities.hpp
- test -f $PREFIX/include/cudf_test/iterator_utilities.hpp
- - test -f $PREFIX/include/cudf_test/stream_checking_resource_adapter.hpp
+ - test -f $PREFIX/include/cudf_test/stream_checking_resource_adaptor.hpp
- test -f $PREFIX/include/cudf_test/table_utilities.hpp
- test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh
- test -f $PREFIX/include/cudf_test/type_list_utilities.hpp
@@ -376,8 +386,6 @@ outputs:
- {{ pin_subpackage('libcudf', exact=True) }}
- {{ pin_subpackage('libcudf_kafka', exact=True) }}
- cudatoolkit {{ cuda_spec }}
- - gtest {{ gtest_version }}
- - gmock {{ gtest_version }}
about:
home: https://rapids.ai/
license: Apache-2.0
diff --git a/conda/recipes/libcudf/post-link.sh b/conda/recipes/libcudf/post-link.sh
index 64e0b1ad305..8ae2349f791 100644
--- a/conda/recipes/libcudf/post-link.sh
+++ b/conda/recipes/libcudf/post-link.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Only add the license notice to libcudf and not our examples / tests
if [[ "$PKG_NAME" == "libcudf" ]]; then
- cat ./nvlink.txt >> $PREFIX/.messages.txt
+ cat ./nvcomp.txt >> $PREFIX/.messages.txt
fi
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a261049d3f0..127df03c54d 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -73,7 +73,7 @@ option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compila
option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL ON)
-if(${CUDA_STATIC_RUNTIME})
+if(CUDA_STATIC_RUNTIME OR NOT BUILD_SHARED_LIBS)
set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL OFF)
endif()
option(
@@ -369,7 +369,7 @@ add_library(
src/io/orc/stripe_data.cu
src/io/orc/stripe_enc.cu
src/io/orc/stripe_init.cu
- src/io/orc/timezone.cpp
+ src/datetime/timezone.cpp
src/io/orc/writer_impl.cu
src/io/parquet/compact_protocol_reader.cpp
src/io/parquet/compact_protocol_writer.cpp
@@ -464,6 +464,7 @@ add_library(
src/reductions/segmented/max.cu
src/reductions/segmented/mean.cu
src/reductions/segmented/min.cu
+ src/reductions/segmented/nunique.cu
src/reductions/segmented/product.cu
src/reductions/segmented/reductions.cpp
src/reductions/segmented/std.cu
@@ -547,6 +548,7 @@ add_library(
src/strings/regex/regex_program.cpp
src/strings/repeat_strings.cu
src/strings/replace/backref_re.cu
+ src/strings/replace/multi.cu
src/strings/replace/multi_re.cu
src/strings/replace/replace.cu
src/strings/replace/replace_re.cu
@@ -739,6 +741,35 @@ add_library(cudf::cudf ALIAS cudf)
# * build cudftestutil ----------------------------------------------------------------------------
if(CUDF_BUILD_TESTUTIL)
+ add_library(
+ cudftest_default_stream
+ # When compiled as a dynamic library, this allows us to use LD_PRELOAD injection of symbols. We
+ # currently leverage this for stream-related library validation and may make use of it for
+ # other similar features in the future.
+ tests/utilities/default_stream.cpp
+ )
+ set_target_properties(
+ cudftest_default_stream
+ PROPERTIES BUILD_RPATH "\$ORIGIN"
+ INSTALL_RPATH "\$ORIGIN"
+ # set target compile options
+ CXX_STANDARD 17
+ CXX_STANDARD_REQUIRED ON
+ CUDA_STANDARD 17
+ CUDA_STANDARD_REQUIRED ON
+ POSITION_INDEPENDENT_CODE ON
+ INTERFACE_POSITION_INDEPENDENT_CODE ON
+ )
+ target_link_libraries(
+ cudftest_default_stream
+ PUBLIC cudf
+ PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
+ )
+
+ add_library(cudf::cudftest_default_stream ALIAS cudftest_default_stream)
+
+ # Needs to be static so that we support static builds of gtest, which isn't compiled with fPIC
+ # enabled and therefore can't be embedded into shared libraries.
add_library(
cudftestutil STATIC
tests/io/metadata_utilities.cpp
@@ -768,7 +799,7 @@ if(CUDF_BUILD_TESTUTIL)
target_link_libraries(
cudftestutil
- PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf
+ PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf cudftest_default_stream
PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
)
@@ -790,18 +821,27 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL)
)
endif()
- # Libraries for stream-related testing.
- add_library(cudf_identify_stream_usage SHARED tests/utilities/identify_stream_usage.cpp)
+ # Libraries for stream-related testing. We build the library twice, once with STREAM_MODE_TESTING
+ # on and once with it off. Each test is then configured via ctest to use the appropriate library,
+ # depending on whether it has been updated to expose public stream APIs.
+ foreach(_mode cudf testing)
+ set(_tgt "cudf_identify_stream_usage_mode_${_mode}")
+ add_library(${_tgt} SHARED tests/utilities/identify_stream_usage.cpp)
+
+ set_target_properties(
+ ${_tgt}
+ PROPERTIES # set target compile options
+ CXX_STANDARD 17
+ CXX_STANDARD_REQUIRED ON
+ POSITION_INDEPENDENT_CODE ON
+ )
+ target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm)
+ add_library(cudf::${_tgt} ALIAS ${_tgt})
- set_target_properties(
- cudf_identify_stream_usage
- PROPERTIES # set target compile options
- CXX_STANDARD 17
- CXX_STANDARD_REQUIRED ON
- POSITION_INDEPENDENT_CODE ON
- )
- target_link_libraries(cudf_identify_stream_usage PUBLIC CUDA::cudart rmm::rmm)
- add_library(cudf::cudf_identify_stream_usage ALIAS cudf_identify_stream_usage)
+ if("${_mode}" STREQUAL "testing")
+ target_compile_definitions(${_tgt} PUBLIC STREAM_MODE_TESTING)
+ endif()
+ endforeach()
endif()
# ##################################################################################################
@@ -851,33 +891,23 @@ install(
EXPORT cudf-exports
)
-install(DIRECTORY ${CUDF_SOURCE_DIR}/include/cudf ${CUDF_SOURCE_DIR}/include/cudf_test
- ${CUDF_SOURCE_DIR}/include/nvtext DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-)
-
-if(CUDF_BUILD_TESTUTIL)
+set(_components_export_string)
+if(TARGET cudftestutil)
install(
- TARGETS cudftestutil
+ TARGETS cudftest_default_stream cudftestutil
DESTINATION ${lib_dir}
EXPORT cudf-testing-exports
)
-
- install(
- EXPORT cudf-testing-exports
- FILE cudf-testing-targets.cmake
- NAMESPACE cudf::
- DESTINATION "${lib_dir}/cmake/cudf"
- )
-
- include("${rapids-cmake-dir}/export/write_dependencies.cmake")
- rapids_export_write_dependencies(
- INSTALL cudf-testing-exports
- "${PROJECT_BINARY_DIR}/rapids-cmake/cudf/export/cudf-testing-dependencies.cmake"
- )
+ set(_components_export_string COMPONENTS testing COMPONENTS_EXPORT_SET cudf-testing-exports)
endif()
+install(DIRECTORY ${CUDF_SOURCE_DIR}/include/cudf ${CUDF_SOURCE_DIR}/include/cudf_test
+ ${CUDF_SOURCE_DIR}/include/nvtext DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+)
+
if(CUDF_BUILD_STREAMS_TEST_UTIL)
- install(TARGETS cudf_identify_stream_usage DESTINATION ${lib_dir})
+ install(TARGETS cudf_identify_stream_usage_mode_cudf DESTINATION ${lib_dir})
+ install(TARGETS cudf_identify_stream_usage_mode_testing DESTINATION ${lib_dir})
endif()
set(doc_string
@@ -936,12 +966,6 @@ string(
[=[
if(testing IN_LIST cudf_FIND_COMPONENTS)
enable_language(CUDA)
- if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake")
- include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake")
- endif()
- if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake")
- include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake")
- endif()
endif()
]=]
)
@@ -949,8 +973,8 @@ string(APPEND install_code_string "${common_code_string}")
rapids_export(
INSTALL cudf
- EXPORT_SET cudf-exports
- GLOBAL_TARGETS cudf
+ EXPORT_SET cudf-exports ${_components_export_string}
+ GLOBAL_TARGETS cudf cudftestutil
NAMESPACE cudf::
DOCUMENTATION doc_string
FINAL_CODE_BLOCK install_code_string
@@ -973,23 +997,13 @@ string(APPEND build_code_string "${common_code_string}")
rapids_export(
BUILD cudf
- EXPORT_SET cudf-exports
- GLOBAL_TARGETS cudf
+ EXPORT_SET cudf-exports ${_components_export_string}
+ GLOBAL_TARGETS cudf cudftestutil
NAMESPACE cudf::
DOCUMENTATION doc_string
FINAL_CODE_BLOCK build_code_string
)
-if(CUDF_BUILD_TESTUTIL)
- export(
- EXPORT cudf-testing-exports
- FILE ${CUDF_BINARY_DIR}/cudf-testing-targets.cmake
- NAMESPACE cudf::
- )
- rapids_export_write_dependencies(
- BUILD cudf-testing-exports "${CUDF_BINARY_DIR}/cudf-testing-dependencies.cmake"
- )
-endif()
# ##################################################################################################
# * make documentation ----------------------------------------------------------------------------
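
The two `cudf_identify_stream_usage_mode_*` targets above come from compiling one source file twice,
with the `STREAM_MODE_TESTING` definition toggling which default stream the preloaded library
validates against. A minimal sketch of the pattern, assuming a hypothetical `validate_stream` hook
(the real `identify_stream_usage.cpp` interposes on actual stream-taking entry points instead):

```cpp
#include <cstdio>
#include <cstdlib>

// Built as a shared library and injected with LD_PRELOAD, this definition
// interposes on the identically named symbol in the test process.
extern "C" void validate_stream(void const* stream, void const* expected)
{
  if (stream != expected) {
#ifdef STREAM_MODE_TESTING
    std::fprintf(stderr, "unexpected stream: expected the testing default stream\n");
#else
    std::fprintf(stderr, "unexpected stream: expected cudf's default stream\n");
#endif
    std::abort();
  }
}
```
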
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index cc0b642a337..b9c15e244de 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -150,6 +150,7 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp
# * stream_compaction benchmark -------------------------------------------------------------------
ConfigureNVBench(
STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/unique.cpp
+ stream_compaction/unique_count.cpp
)
# ##################################################################################################
@@ -191,7 +192,7 @@ ConfigureBench(
)
ConfigureNVBench(
REDUCTION_NVBENCH reduction/distinct_count.cpp reduction/rank.cpp reduction/scan_structs.cpp
- reduction/segment_reduce.cu
+ reduction/segmented_reduce.cpp
)
# ##################################################################################################
diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu
index edb19b7b0ca..762e9640d12 100644
--- a/cpp/benchmarks/common/generate_input.cu
+++ b/cpp/benchmarks/common/generate_input.cu
@@ -31,6 +31,7 @@
#include
#include
+#include
#include
#include
@@ -429,8 +430,12 @@ std::unique_ptr create_random_column(data_profile const& profile,
null_mask.begin());
}
- auto [result_bitmask, null_count] = cudf::detail::valid_if(
- null_mask.begin(), null_mask.end(), thrust::identity{}, cudf::get_default_stream());
+ auto [result_bitmask, null_count] =
+ cudf::detail::valid_if(null_mask.begin(),
+ null_mask.end(),
+ thrust::identity{},
+ cudf::get_default_stream(),
+ rmm::mr::get_current_device_resource());
return std::make_unique(
dtype,
@@ -508,8 +513,12 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons
thrust::make_zip_iterator(offsets.begin(), offsets.begin() + 1),
num_rows,
string_generator{chars.data(), engine});
- auto [result_bitmask, null_count] = cudf::detail::valid_if(
- null_mask.begin(), null_mask.end() - 1, thrust::identity{}, cudf::get_default_stream());
+ auto [result_bitmask, null_count] =
+ cudf::detail::valid_if(null_mask.begin(),
+ null_mask.end() - 1,
+ thrust::identity{},
+ cudf::get_default_stream(),
+ rmm::mr::get_current_device_resource());
return cudf::make_strings_column(
num_rows,
std::move(offsets),
@@ -542,7 +551,8 @@ std::unique_ptr create_random_column(data_profi
sample_indices,
cudf::out_of_bounds_policy::DONT_CHECK,
cudf::detail::negative_index_policy::NOT_ALLOWED,
- cudf::get_default_stream());
+ cudf::get_default_stream(),
+ rmm::mr::get_current_device_resource());
return std::move(str_table->release()[0]);
}
@@ -626,8 +636,11 @@ std::unique_ptr create_random_column(data_profi
auto [null_mask, null_count] = [&]() {
if (profile.get_null_probability().has_value()) {
auto valids = valid_dist(engine, num_rows);
- return cudf::detail::valid_if(
- valids.begin(), valids.end(), thrust::identity{}, cudf::get_default_stream());
+ return cudf::detail::valid_if(valids.begin(),
+ valids.end(),
+ thrust::identity{},
+ cudf::get_default_stream(),
+ rmm::mr::get_current_device_resource());
}
return std::pair{};
}();
@@ -710,9 +723,12 @@ std::unique_ptr create_random_column(data_profile
auto offsets_column = std::make_unique(
cudf::data_type{cudf::type_id::INT32}, num_rows + 1, offsets.release());
- auto [null_mask, null_count] = cudf::detail::valid_if(
- valids.begin(), valids.end(), thrust::identity{}, cudf::get_default_stream());
- list_column = cudf::make_lists_column(
+ auto [null_mask, null_count] = cudf::detail::valid_if(valids.begin(),
+ valids.end(),
+ thrust::identity{},
+ cudf::get_default_stream(),
+ rmm::mr::get_current_device_resource());
+ list_column = cudf::make_lists_column(
num_rows,
std::move(offsets_column),
std::move(current_child_column),
@@ -838,7 +854,8 @@ std::pair create_random_null_mask(
return cudf::detail::valid_if(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(size),
bool_generator{seed, 1.0 - *null_probability},
- cudf::get_default_stream());
+ cudf::get_default_stream(),
+ rmm::mr::get_current_device_resource());
}
}
diff --git a/cpp/benchmarks/io/json/nested_json.cpp b/cpp/benchmarks/io/json/nested_json.cpp
index 416cf403671..d03f36ca81f 100644
--- a/cpp/benchmarks/io/json/nested_json.cpp
+++ b/cpp/benchmarks/io/json/nested_json.cpp
@@ -171,7 +171,8 @@ void BM_NESTED_JSON(nvbench::state& state)
cudf::io::json::detail::device_parse_nested_json(
cudf::device_span{input->data(), static_cast(input->size())},
default_options,
- cudf::get_default_stream());
+ cudf::get_default_stream(),
+ rmm::mr::get_current_device_resource());
});
auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
@@ -202,7 +203,7 @@ void BM_NESTED_JSON_DEPTH(nvbench::state& state)
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
// Allocate device-side temporary storage & run algorithm
cudf::io::json::detail::device_parse_nested_json(
- input, default_options, cudf::get_default_stream());
+ input, default_options, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
});
auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu
index 73060200d00..1b1cf9b7e9d 100644
--- a/cpp/benchmarks/iterator/iterator.cu
+++ b/cpp/benchmarks/iterator/iterator.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -140,8 +140,8 @@ void BM_iterator(benchmark::State& state)
cudf::column_view hasnull_F = wrap_hasnull_F;
// Initialize dev_result to false
- auto dev_result =
- cudf::detail::make_zeroed_device_uvector_sync(1, cudf::get_default_stream());
+ auto dev_result = cudf::detail::make_zeroed_device_uvector_sync(
+ 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
if (cub_or_thrust) {
@@ -210,7 +210,7 @@ void BM_pair_iterator(benchmark::State& state)
// Initialize dev_result to false
auto dev_result = cudf::detail::make_zeroed_device_uvector_sync>(
- 1, cudf::get_default_stream());
+ 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
for (auto _ : state) {
cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0
if (cub_or_thrust) {
diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp
index e37a4ca1193..70036a95377 100644
--- a/cpp/benchmarks/join/join_common.hpp
+++ b/cpp/benchmarks/join/join_common.hpp
@@ -104,8 +104,11 @@ void BM_join(state_type& state, Join JoinFunc)
// roughly 75% nulls
auto validity =
thrust::make_transform_iterator(thrust::make_counting_iterator(0), null75_generator{});
- return cudf::detail::valid_if(
- validity, validity + size, thrust::identity{}, cudf::get_default_stream())
+ return cudf::detail::valid_if(validity,
+ validity + size,
+ thrust::identity{},
+ cudf::get_default_stream(),
+ rmm::mr::get_current_device_resource())
.first;
};
diff --git a/cpp/benchmarks/reduction/segment_reduce.cu b/cpp/benchmarks/reduction/segmented_reduce.cpp
similarity index 58%
rename from cpp/benchmarks/reduction/segment_reduce.cu
rename to cpp/benchmarks/reduction/segmented_reduce.cpp
index 127b3598dae..590a014ad76 100644
--- a/cpp/benchmarks/reduction/segment_reduce.cu
+++ b/cpp/benchmarks/reduction/segmented_reduce.cpp
@@ -20,17 +20,15 @@
#include
#include
-#include
+#include
#include
+#include
#include
#include
#include
-#include
-
#include
-#include
bool constexpr is_boolean_output_agg(cudf::segmented_reduce_aggregation::Kind kind)
{
@@ -38,8 +36,15 @@ bool constexpr is_boolean_output_agg(cudf::segmented_reduce_aggregation::Kind ki
kind == cudf::segmented_reduce_aggregation::ANY;
}
+bool constexpr is_float_output_agg(cudf::segmented_reduce_aggregation::Kind kind)
+{
+ return kind == cudf::segmented_reduce_aggregation::MEAN ||
+ kind == cudf::segmented_reduce_aggregation::VARIANCE ||
+ kind == cudf::segmented_reduce_aggregation::STD;
+}
+
template
-std::unique_ptr make_simple_aggregation()
+std::unique_ptr make_reduce_aggregation()
{
switch (kind) {
case cudf::segmented_reduce_aggregation::SUM:
@@ -54,12 +59,22 @@ std::unique_ptr make_simple_aggregation()
return cudf::make_all_aggregation();
case cudf::segmented_reduce_aggregation::ANY:
return cudf::make_any_aggregation();
- default: CUDF_FAIL("Unsupported simple segmented aggregation");
+ case cudf::segmented_reduce_aggregation::SUM_OF_SQUARES:
+ return cudf::make_sum_of_squares_aggregation();
+ case cudf::segmented_reduce_aggregation::MEAN:
+ return cudf::make_mean_aggregation();
+ case cudf::segmented_reduce_aggregation::VARIANCE:
+ return cudf::make_variance_aggregation();
+ case cudf::segmented_reduce_aggregation::STD:
+ return cudf::make_std_aggregation();
+ case cudf::segmented_reduce_aggregation::NUNIQUE:
+ return cudf::make_nunique_aggregation();
+ default: CUDF_FAIL("Unsupported segmented reduce aggregation in this benchmark");
}
}
template
-std::pair, thrust::device_vector> make_test_data(
+std::pair, std::unique_ptr> make_test_data(
nvbench::state& state)
{
auto const column_size{cudf::size_type(state.get_int64("column_size"))};
@@ -72,28 +87,30 @@ std::pair, thrust::device_vector>
dtype, distribution_id::UNIFORM, 0, 100);
auto input = create_random_column(dtype, row_count{column_size}, profile);
- auto offset_it = cudf::detail::make_counting_transform_iterator(
- 0, [column_size, segment_length] __device__(auto i) {
- return column_size < i * segment_length ? column_size : i * segment_length;
- });
-
- thrust::device_vector d_offsets(offset_it, offset_it + num_segments + 1);
-
- return std::pair(std::move(input), d_offsets);
+ auto offsets = cudf::sequence(num_segments + 1,
+ cudf::numeric_scalar(0),
+ cudf::numeric_scalar(segment_length));
+ return std::pair(std::move(input), std::move(offsets));
}
template
-void BM_Simple_Segmented_Reduction(nvbench::state& state,
- nvbench::type_list>)
+void BM_Segmented_Reduction(nvbench::state& state,
+ nvbench::type_list>)
{
auto const column_size{cudf::size_type(state.get_int64("column_size"))};
auto const num_segments{cudf::size_type(state.get_int64("num_segments"))};
auto [input, offsets] = make_test_data(state);
- auto agg = make_simple_aggregation();
+ auto agg = make_reduce_aggregation();
- auto output_type = is_boolean_output_agg(kind) ? cudf::data_type{cudf::type_id::BOOL8}
- : cudf::data_type{cudf::type_to_id()};
+ auto const output_type = [] {
+ if (is_boolean_output_agg(kind)) { return cudf::data_type{cudf::type_id::BOOL8}; }
+ if (is_float_output_agg(kind)) { return cudf::data_type{cudf::type_id::FLOAT64}; }
+ if (kind == cudf::segmented_reduce_aggregation::NUNIQUE) {
+ return cudf::data_type{cudf::type_to_id()};
+ }
+ return cudf::data_type{cudf::type_to_id()};
+ }();
state.add_element_count(column_size);
state.add_global_memory_reads(column_size);
@@ -103,8 +120,10 @@ void BM_Simple_Segmented_Reduction(nvbench::state& state,
state.add_global_memory_writes(num_segments);
}
- auto const input_view = input->view();
- auto const offset_span = cudf::device_span{offsets};
+ auto const input_view = input->view();
+ auto const offsets_view = offsets->view();
+ auto const offset_span = cudf::device_span{
+ offsets_view.template data(), static_cast(offsets_view.size())};
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(
@@ -115,13 +134,17 @@ void BM_Simple_Segmented_Reduction(nvbench::state& state,
using Types = nvbench::type_list;
// Skip benchmarking MAX/ANY since they are covered by MIN/ALL respectively.
+// Also VARIANCE includes STD calculation.
using AggKinds = nvbench::enum_type_list;
+ cudf::aggregation::ALL,
+ cudf::aggregation::MEAN,
+ cudf::aggregation::VARIANCE,
+ cudf::aggregation::NUNIQUE>;
-NVBENCH_BENCH_TYPES(BM_Simple_Segmented_Reduction, NVBENCH_TYPE_AXES(Types, AggKinds))
- .set_name("segmented_reduction_simple")
+NVBENCH_BENCH_TYPES(BM_Segmented_Reduction, NVBENCH_TYPE_AXES(Types, AggKinds))
+ .set_name("segmented_reduction")
.set_type_axes_names({"DataType", "AggregationKinds"})
.add_int64_axis("column_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
.add_int64_axis("num_segments", {1'000, 10'000, 100'000});
diff --git a/cpp/benchmarks/stream_compaction/unique_count.cpp b/cpp/benchmarks/stream_compaction/unique_count.cpp
new file mode 100644
index 00000000000..f8319e0385c
--- /dev/null
+++ b/cpp/benchmarks/stream_compaction/unique_count.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+
+template
+void nvbench_unique_count(nvbench::state& state, nvbench::type_list)
+{
+ auto const num_rows = static_cast(state.get_int64("NumRows"));
+ auto const nulls = state.get_float64("NullProbability");
+
+ data_profile profile = data_profile_builder().cardinality(0).null_probability(nulls).distribution(
+ cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows / 100);
+
+ auto source_column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile);
+ auto sorted_table = cudf::sort(cudf::table_view({source_column->view()}));
+
+ auto input = sorted_table->view();
+
+ state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+ state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+ cudf::unique_count(input, cudf::null_equality::EQUAL);
+ });
+}
+
+using data_type = nvbench::type_list;
+
+NVBENCH_BENCH_TYPES(nvbench_unique_count, NVBENCH_TYPE_AXES(data_type))
+ .set_name("unique_count")
+ .set_type_axes_names({"Type"})
+ .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000})
+ .add_float64_axis("NullProbability", {0.0, 0.1});
diff --git a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp
index b25af14ec2a..cb570020f0e 100644
--- a/cpp/benchmarks/string/replace.cpp
+++ b/cpp/benchmarks/string/replace.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -69,7 +69,7 @@ static void generate_bench_args(benchmark::internal::Benchmark* b)
int const row_mult = 8;
int const min_rowlen = 1 << 5;
int const max_rowlen = 1 << 13;
- int const len_mult = 4;
+ int const len_mult = 2;
generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
}
diff --git a/cpp/cmake/config.json b/cpp/cmake/config.json
index f7d7b001856..a65afe9e58d 100644
--- a/cpp/cmake/config.json
+++ b/cpp/cmake/config.json
@@ -13,7 +13,11 @@
}
},
"ConfigureTest": {
- "flags": ["TEST_NAME", "TEST_SRC"]
+ "flags": ["TEST_NAME", "TEST_SRC"],
+ "kwargs": {
+ "GPUS": 1,
+ "PERCENT": 1
+ }
},
"ConfigureBench": {
"flags": ["BENCH_NAME", "BENCH_SRC"]
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index 943b89238e0..a716995182d 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -379,6 +379,8 @@ endfunction()
if(NOT DEFINED CUDF_VERSION_Arrow)
set(CUDF_VERSION_Arrow
+ # This version must be kept in sync with the libarrow version pinned for builds in
+ # dependencies.yaml.
10.0.1
CACHE STRING "The version of Arrow to find (or build)"
)
diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
index 8cd4f8c6d27..91c3dccfdc6 100644
--- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
+++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -121,8 +121,8 @@ recommend watching Sean Parent's [C++ Seasoning talk](https://www.youtube.com/wa
and we try to follow his rules: "No raw loops. No raw pointers. No raw synchronization primitives."
* Prefer algorithms from STL and Thrust to raw loops.
- * Prefer libcudf and RMM [owning data structures and views](#libcudf-data-structures) to raw pointers
- and raw memory allocation.
+ * Prefer libcudf and RMM [owning data structures and views](#libcudf-data-structures) to raw
+ pointers and raw memory allocation.
* libcudf doesn't have a lot of CPU-thread concurrency, but there is some. And currently libcudf
does use raw synchronization primitives. So we should revisit Parent's third rule and improve
here.
@@ -146,8 +146,8 @@ The following guidelines apply to organizing `#include` lines.
* Separate groups by a blank line.
* Order the groups from "nearest" to "farthest". In other words, local includes, then includes
from other RAPIDS libraries, then includes from related libraries, like ``, then
- includes from dependencies installed with cuDF, and then standard headers (for example ``,
- ``).
+ includes from dependencies installed with cuDF, and then standard headers (for example
+ ``, ``).
* Use `<>` instead of `""` unless the header is in the same directory as the source file.
* Tools like `clangd` often auto-insert includes when they can, but they usually get the grouping
and brackets wrong.
@@ -269,6 +269,15 @@ An *immutable*, non-owning view of a table.
A *mutable*, non-owning view of a table.
+## cudf::size_type
+
+The `cudf::size_type` is the type used for the number of elements in a column, offsets to elements
+within a column, indices to address specific elements, segments for subsets of column elements, etc.
+It is equivalent to a signed, 32-bit integer type and therefore has a maximum value of 2147483647.
+Some APIs also accept negative index values, and those functions support a minimum value of
+-2147483648. This fundamental type also constrains output values: not just column sizes but
+element counts are limited by it as well.
+
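
A standalone snippet (not part of this diff) illustrating the limits just described:

```cpp
#include <cudf/types.hpp>

#include <cstdint>
#include <cstdio>
#include <limits>
#include <type_traits>

static_assert(std::is_same_v<cudf::size_type, int32_t>,
              "cudf::size_type is a signed 32-bit integer");

int main()
{
  // Maximum number of elements addressable in a single column.
  std::printf("%d\n", std::numeric_limits<cudf::size_type>::max());  // 2147483647
  // Minimum value for APIs that accept negative indices.
  std::printf("%d\n", std::numeric_limits<cudf::size_type>::min());  // -2147483648
}
```
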
## Spans
libcudf provides `span` classes that mimic C++20 `std::span`, which is a lightweight
@@ -336,8 +345,8 @@ auto s1 = static_cast(s.get());
```
### Passing to device
-Each scalar type, except `list_scalar`, has a corresponding non-owning device view class which allows
-access to the value and its validity from the device. This can be obtained using the function
+Each scalar type, except `list_scalar`, has a corresponding non-owning device view class which
+allows access to the value and its validity from the device. This can be obtained using the function
`get_scalar_device_view(ScalarType s)`. Note that a device view is not provided for a base scalar
object, only for the derived typed scalar class objects.
@@ -348,68 +357,84 @@ data, a specialized device view for list columns can be constructed via
# libcudf Policies and Design Principles
-`libcudf` is designed to provide thread-safe, single-GPU accelerated algorithm primitives for solving a wide variety of problems that arise in data science.
-APIs are written to execute on the default GPU, which can be controlled by the caller through standard CUDA device APIs or environment variables like `CUDA_VISIBLE_DEVICES`.
-Our goal is to enable diverse use cases like Spark or Pandas to benefit from the performance of GPUs, and libcudf relies on these higher-level layers like Spark or Dask to orchestrate multi-GPU tasks.
+`libcudf` is designed to provide thread-safe, single-GPU accelerated algorithm primitives for
+solving a wide variety of problems that arise in data science. APIs are written to execute on the
+default GPU, which can be controlled by the caller through standard CUDA device APIs or environment
+variables like `CUDA_VISIBLE_DEVICES`. Our goal is to enable diverse use cases like Spark or Pandas
+to benefit from the performance of GPUs, and libcudf relies on these higher-level layers like Spark
+or Dask to orchestrate multi-GPU tasks.
-To best satisfy these use-cases, libcudf prioritizes performance and flexibility, which sometimes may come at the cost of convenience.
-While we welcome users to use libcudf directly, we design with the expectation that most users will be consuming libcudf through higher-level layers like Spark or cuDF Python that handle some of details that direct users of libcudf must handle on their own.
-We document these policies and the reasons behind them here.
+To best satisfy these use-cases, libcudf prioritizes performance and flexibility, which sometimes
+may come at the cost of convenience. While we welcome users to use libcudf directly, we design with
+the expectation that most users will be consuming libcudf through higher-level layers like Spark or
+cuDF Python that handle some of the details that direct users of libcudf must handle on their own. We
+document these policies and the reasons behind them here.
## libcudf does not introspect data
libcudf APIs generally do not perform deep introspection and validation of input data.
There are numerous reasons for this:
1. It violates the single responsibility principle: validation is separate from execution.
-2. Since libcudf data structures store data on the GPU, any validation incurs _at minimum_ the overhead of a kernel launch, and may in general be prohibitively expensive.
+2. Since libcudf data structures store data on the GPU, any validation incurs _at minimum_ the
+ overhead of a kernel launch, and may in general be prohibitively expensive.
3. API promises around data introspection often significantly complicate implementation.
Users are therefore responsible for passing valid data into such APIs.
_Note that this policy does not mean that libcudf performs no validation whatsoever_.
libcudf APIs should still perform any validation that does not require introspection.
-To give some idea of what should or should not be validated, here are (non-exhaustive) lists of examples.
+To give some idea of what should or should not be validated, here are (non-exhaustive) lists of
+examples.
**Things that libcudf should validate**:
-- Input column/table sizes or dtypes
+- Input column/table sizes or data types
**Things that libcudf should not validate**:
- Integer overflow
-- Ensuring that outputs will not exceed the 2GB size limit for a given set of inputs
+- Ensuring that outputs will not exceed the [2GB size](#cudfsize_type) limit for a given set of
+ inputs
## libcudf expects nested types to have sanitized null masks
-Various libcudf APIs accepting columns of nested dtypes (such as `LIST` or `STRUCT`) may assume that these columns have been sanitized.
-In this context, sanitization refers to ensuring that the null elements in a column with a nested dtype are compatible with the elements of nested columns.
+Various libcudf APIs accepting columns of nested data types (such as `LIST` or `STRUCT`) may assume
+that these columns have been sanitized. In this context, sanitization refers to ensuring that the
+null elements in a column with a nested dtype are compatible with the elements of nested columns.
Specifically:
-- Null elements of list columns should also be empty. The starting offset of a null element should be equal to the ending offset.
+- Null elements of list columns should also be empty. The starting offset of a null element should
+ be equal to the ending offset.
- Null elements of struct columns should also be null elements in the underlying structs.
-- For compound columns, nulls should only be present at the level of the parent column. Child columns should not contain nulls.
+- For compound columns, nulls should only be present at the level of the parent column. Child
+ columns should not contain nulls.
- Slice operations on nested columns do not propagate offsets to child columns.
-libcudf APIs _should_ promise to never return "dirty" columns, i.e. columns containing unsanitized data.
-Therefore, the only problem is if users construct input columns that are not correctly sanitized and then pass those into libcudf APIs.
+libcudf APIs _should_ promise to never return "dirty" columns, i.e. columns containing unsanitized
+data. Therefore, the only problem is if users construct input columns that are not correctly
+sanitized and then pass those into libcudf APIs.
## Treat libcudf APIs as if they were asynchronous
libcudf APIs called on the host do not guarantee that the stream is synchronized before returning.
-Work in libcudf occurs on `cudf::get_default_stream().value`, which defaults to the CUDA default stream (stream 0).
-Note that the stream 0 behavior differs if [per-thread default stream is enabled](https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html) via `CUDF_USE_PER_THREAD_DEFAULT_STREAM`.
-Any data provided to or returned by libcudf that uses a separate non-blocking stream requires synchronization with the default libcudf stream to ensure stream safety.
+Work in libcudf occurs on `cudf::get_default_stream().value`, which defaults to the CUDA default
+stream (stream 0). Note that the stream 0 behavior differs if [per-thread default stream is
+enabled](https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html) via
+`CUDF_USE_PER_THREAD_DEFAULT_STREAM`. Any data provided to or returned by libcudf that uses a
+separate non-blocking stream requires synchronization with the default libcudf stream to ensure
+stream safety.
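
For example, a caller that must hand results to work on another non-blocking stream could do the
following (a minimal sketch; `concatenate` stands in for any libcudf call):

```cpp
#include <cudf/concatenate.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <memory>
#include <vector>

std::unique_ptr<cudf::table> copy_then_sync(cudf::table_view const& tv)
{
  std::vector<cudf::table_view> views{tv, tv};
  auto result = cudf::concatenate(views);
  // The call above may return before the device work completes; synchronize
  // on the default libcudf stream before using `result` on another stream.
  cudf::get_default_stream().synchronize();
  return result;
}
```
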
## libcudf generally does not make ordering guarantees
-Functions like merge or groupby in libcudf make no guarantees about the order of entries in the output.
-Promising deterministic ordering is not, in general, conducive to fast parallel algorithms.
+Functions like merge or groupby in libcudf make no guarantees about the order of entries in the
+output. Promising deterministic ordering is not, in general, conducive to fast parallel algorithms.
Calling code is responsible for performing sorts after the fact if sorted outputs are needed.
## libcudf does not promise specific exception messages
-libcudf documents the exceptions that will be thrown by an API for different kinds of invalid inputs.
-The types of those exceptions (e.g. `cudf::logic_error`) are part of the public API.
-However, the explanatory string returned by the `what` method of those exceptions is not part of the API and is subject to change.
-Calling code should not rely on the contents of libcudf error messages to determine the nature of the error.
-For information on the types of exceptions that libcudf throws under different circumstances, see the [section on error handling](#errors).
+libcudf documents the exceptions that will be thrown by an API for different kinds of invalid
+inputs. The types of those exceptions (e.g. `cudf::logic_error`) are part of the public API.
+However, the explanatory string returned by the `what` method of those exceptions is not part of the
+API and is subject to change. Calling code should not rely on the contents of libcudf error
+messages to determine the nature of the error. For information on the types of exceptions that
+libcudf throws under different circumstances, see the [section on error handling](#errors).
# libcudf API and Implementation
@@ -468,14 +493,6 @@ asynchrony if and when we add an asynchronous API to libcudf.
**Note:** `cudaDeviceSynchronize()` should *never* be used.
This limits the ability to do any multi-stream/multi-threaded work with libcudf APIs.
- ### NVTX Ranges
-
-In order to aid in performance optimization and debugging, all compute intensive libcudf functions
-should have a corresponding NVTX range. In libcudf, we have a convenience macro `CUDF_FUNC_RANGE()`
-that will automatically annotate the lifetime of the enclosing function and use the function's name
-as the name of the NVTX range. For more information about NVTX, see
-[here](https://github.com/NVIDIA/NVTX/tree/dev/c).
-
### Stream Creation
There may be times in implementing libcudf features where it would be advantageous to use streams
@@ -487,8 +504,8 @@ should avoid creating streams (even if it is slightly less efficient). It is a g
## Memory Allocation
-Device [memory resources](#rmmdevice_memory_resource) are used in libcudf to abstract and control how device
-memory is allocated.
+Device [memory resources](#rmmdevice_memory_resource) are used in libcudf to abstract and control
+how device memory is allocated.
### Output Memory
@@ -508,6 +525,12 @@ std::unique_ptr returns_output_memory(
void does_not_allocate_output_memory(...);
```
+This rule automatically applies to all detail APIs that allocate memory. Any detail API may be
+called by any public API, and therefore could be allocating memory that is returned to the user.
+To support such use cases, all detail APIs that allocate memory should accept an `mr`
+parameter. Callers are responsible for either passing through a provided `mr` or passing
+`rmm::mr::get_current_device_resource()` as needed.
+
### Temporary Memory
Not all memory allocated within a libcudf API is returned to the caller. Often algorithms must
@@ -528,7 +551,7 @@ rmm::device_buffer some_function(
### Memory Management
libcudf code generally eschews raw pointers and direct memory allocation. Use RMM classes built to
-use `device_memory_resource`(*)s for device memory allocation with automated lifetime management.
+use `device_memory_resource`s for device memory allocation with automated lifetime management.
#### rmm::device_buffer
Allocates a specified number of bytes of untyped, uninitialized device memory using a
@@ -610,6 +633,32 @@ rmm::mr::device_memory_resource * mr = new my_custom_resource{...};
rmm::device_uvector v2{100, s, mr};
```
+## Default Parameters
+
+While public libcudf APIs are free to include default function parameters, detail functions should
+not. Default memory resource parameters make it easy for developers to accidentally allocate memory
+using the incorrect resource. Avoiding default memory resources forces developers to consider each
+memory allocation carefully.
+
+While streams are not currently exposed in libcudf's public API, we plan to expose them
+eventually. As a result, the same reasoning that applies to memory resources also applies to
+streams. Public APIs default to using `cudf::get_default_stream()`. However, including the same
+default in detail APIs opens the door for developers to forget to forward a user-provided stream
+that was passed to a public API. Forcing every detail API call to pass a stream explicitly is
+intended to prevent such mistakes.
+
+The memory resource (and eventually, the stream) is the final parameter of essentially all public
+APIs. For consistency, the same is true throughout libcudf's internals. Therefore, a consequence
+of not allowing default streams or memory resources is that no parameters in detail APIs may have
+defaults.
+
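+As an illustration (reusing the hypothetical `detail::do_thing` from the sketch above), a
+defaulted stream in a detail API lets the following bug compile silently:
+
+```c++
+// Suppose detail::do_thing (wrongly) declared defaults for its stream and mr parameters.
+// Then this compiles, quietly running on the default stream:
+std::unique_ptr<column> do_thing_on(column_view const& input, rmm::cuda_stream_view user_stream)
+{
+  return detail::do_thing(input);  // oops: user_stream was never forwarded
+}
+```
+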
+## NVTX Ranges
+
+In order to aid in performance optimization and debugging, all compute-intensive libcudf functions
+should have a corresponding NVTX range. libcudf has a convenience macro `CUDF_FUNC_RANGE()` that
+automatically annotates the lifetime of the enclosing function and uses the function's name as
+the name of the NVTX range. For more information about NVTX, see
+[here](https://github.com/NVIDIA/NVTX/tree/dev/c).
+
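+For example, a compute-intensive function would open with the macro (a sketch; the function name
+is hypothetical):
+
+```c++
+std::unique_ptr<column> my_expensive_function(column_view const& input,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();  // opens an NVTX range named after this function for its lifetime
+  // ... kernel launches and allocations ...
+}
+```
+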
## Input/Output Style
The preferred style for how inputs are passed in and outputs are returned is the following:
@@ -746,8 +795,8 @@ where compile time was a problem is in types used to store indices, which can be
The "Indexalator", or index-normalizing iterator (`include/cudf/detail/indexalator.cuh`), can be
used for index types (integers) without requiring a type-specific instance. It can be used for any
iterator interface for reading an array of integer values of type `int8`, `int16`, `int32`,
-`int64`, `uint8`, `uint16`, `uint32`, or `uint64`. Reading specific elements always return a
-`cudf::size_type` integer.
+`int64`, `uint8`, `uint16`, `uint32`, or `uint64`. Reading specific elements always returns a
+[`cudf::size_type`](#cudfsize_type) integer.
Use the `indexalator_factory` to create an appropriate input iterator from a column_view. Example
input iterator usage:
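
A sketch along these lines (assuming the factory's `make_input_iterator` interface and an
in-scope `stream`):

```c++
auto begin = cudf::detail::indexalator_factory::make_input_iterator(gather_map);
auto end   = begin + gather_map.size();
// Values read through the iterator are normalized to cudf::size_type.
auto it    = thrust::find(rmm::exec_policy(stream), begin, end, cudf::size_type{7});
```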
@@ -879,9 +928,9 @@ CUDF_FAIL("This code path should not be reached.");
### CUDA Error Checking
-Use the `CUDF_CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This
-macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. The
-thrown exception includes a description of the CUDA error code in its `what()` message.
+Use the `CUDF_CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions.
+This macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`.
+The thrown exception includes a description of the CUDA error code in its `what()` message.
Example:
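
A representative sketch (assuming `dst`, `src`, `num_bytes`, and `stream` are in scope):

```c++
// Throws cudf::cuda_error if the asynchronous copy fails to launch.
CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, num_bytes, cudaMemcpyDefault, stream.value()));
```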
@@ -1104,8 +1153,8 @@ For list columns, the parent column's type is `LIST` and contains no data, but i
the number of lists in the column, and its null mask represents the validity of each list element.
The parent has two children.
-1. A non-nullable column of `INT32` elements that indicates the offset to the beginning of each list
- in a dense column of elements.
+1. A non-nullable column of [`size_type`](#cudfsize_type) elements that indicates the offset to the
+ beginning of each list in a dense column of elements.
2. A column containing the actual data and optional null mask for all elements of all the lists
packed together.
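
For example, a list column `[[1, 2], [], [3]]` stores an offsets child `[0, 2, 2, 3]` and a data
child `[1, 2, 3]`.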
@@ -1152,7 +1201,7 @@ a non-nullable column of `INT8` data. The parent column's type is `STRING` and c
but its size represents the number of strings in the column, and its null mask represents the
validity of each string. To summarize, the strings column children are:
-1. A non-nullable column of `INT32` elements that indicates the offset to the beginning of each
+1. A non-nullable column of [`size_type`](#cudfsize_type) elements that indicates the offset to the beginning of each
string in a dense column of all characters.
2. A non-nullable column of `INT8` elements of all the characters across all the strings packed
together.
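
For example, a strings column `["this", "is"]` stores an offsets child `[0, 4, 6]` and a
characters child holding `thisis`.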
@@ -1264,9 +1313,9 @@ libcudf provides view types for nested column types as well as for the data elem
`cudf::strings_column_view` is a view of a strings column, like `cudf::column_view` is a view of
any `cudf::column`. `cudf::string_view` is a view of a single string, and therefore
`cudf::string_view` is the data type of a `cudf::column` of type `STRING` just like `int32_t` is the
-data type for a `cudf::column` of type `INT32`. As it's name implies, this is a read-only object
-instance that points to device memory inside the strings column. It's lifespan is the same (or less)
-as the column it views.
+data type for a `cudf::column` of type `INT32`. As its name implies, this is a
+read-only object instance that points to device memory inside the strings column. Its lifespan is
+the same as (or shorter than) that of the column it views.
Use the `column_device_view::element` method to access an individual row element. Like any other
column, do not call `element()` on a row that is null.
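
A sketch of device-side access (assuming a STRING column `input` and an in-scope `stream`):

```c++
auto d_col = cudf::column_device_view::create(input, stream);
thrust::for_each_n(rmm::exec_policy(stream),
                   thrust::make_counting_iterator<cudf::size_type>(0),
                   input.size(),
                   [d = *d_col] __device__(cudf::size_type row) {
                     if (d.is_valid(row)) {
                       auto const v = d.element<cudf::string_view>(row);
                       // ... use v ...
                     }
                   });
```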
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index e269d4d2e13..b688bf3d445 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -535,7 +535,9 @@ class argmin_aggregation final : public rolling_aggregation, public groupby_aggr
/**
* @brief Derived class for specifying a nunique aggregation
*/
-class nunique_aggregation final : public groupby_aggregation, public reduce_aggregation {
+class nunique_aggregation final : public groupby_aggregation,
+ public reduce_aggregation,
+ public segmented_reduce_aggregation {
public:
nunique_aggregation(null_policy null_handling)
: aggregation{NUNIQUE}, _null_handling{null_handling}
diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp
index ffd8be971ab..e5609568d10 100644
--- a/cpp/include/cudf/detail/binaryop.hpp
+++ b/cpp/include/cudf/detail/binaryop.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -30,13 +30,12 @@ namespace detail {
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> binary_operation(
- column_view const& lhs,
- column_view const& rhs,
- std::string const& ptx,
- data_type output_type,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> binary_operation(column_view const& lhs,
+ column_view const& rhs,
+ std::string const& ptx,
+ data_type output_type,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::binary_operation(scalar const&, column_view const&, binary_operator,
@@ -44,13 +43,12 @@ std::unique_ptr<column> binary_operation(
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> binary_operation(
- scalar const& lhs,
- column_view const& rhs,
- binary_operator op,
- data_type output_type,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> binary_operation(scalar const& lhs,
+ column_view const& rhs,
+ binary_operator op,
+ data_type output_type,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::binary_operation(column_view const&, scalar const&, binary_operator,
@@ -58,13 +56,12 @@ std::unique_ptr<column> binary_operation(
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> binary_operation(
- column_view const& lhs,
- scalar const& rhs,
- binary_operator op,
- data_type output_type,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> binary_operation(column_view const& lhs,
+ scalar const& rhs,
+ binary_operator op,
+ data_type output_type,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::binary_operation(column_view const&, column_view const&,
@@ -72,12 +69,11 @@ std::unique_ptr<column> binary_operation(
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> binary_operation(
- column_view const& lhs,
- column_view const& rhs,
- binary_operator op,
- data_type output_type,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> binary_operation(column_view const& lhs,
+ column_view const& rhs,
+ binary_operator op,
+ data_type output_type,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
} // namespace detail
} // namespace cudf
diff --git a/cpp/include/cudf/detail/calendrical_month_sequence.cuh b/cpp/include/cudf/detail/calendrical_month_sequence.cuh
index 9dba0ba8961..59fb6758973 100644
--- a/cpp/include/cudf/detail/calendrical_month_sequence.cuh
+++ b/cpp/include/cudf/detail/calendrical_month_sequence.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -38,7 +38,7 @@ struct calendrical_month_sequence_functor {
scalar const& input,
size_type months,
rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+ rmm::mr::device_memory_resource* mr)
{
// Return empty column if n = 0
if (n == 0) return cudf::make_empty_column(input.type());
diff --git a/cpp/include/cudf/detail/concatenate.hpp b/cpp/include/cudf/detail/concatenate.hpp
index 925029597a6..442814bc4fd 100644
--- a/cpp/include/cudf/detail/concatenate.hpp
+++ b/cpp/include/cudf/detail/concatenate.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -33,20 +33,18 @@ namespace detail {
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> concatenate(
-  host_span<column_view const> columns_to_concat,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> concatenate(host_span<column_view const> columns_to_concat,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::concatenate(host_span<table_view const>,rmm::mr::device_memory_resource*)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<table> concatenate(
-  host_span<table_view const> tables_to_concat,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<table> concatenate(host_span<table_view const> tables_to_concat,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
} // namespace detail
} // namespace cudf
diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp
index 8c3f315284d..83395f8fa90 100644
--- a/cpp/include/cudf/detail/copy.hpp
+++ b/cpp/include/cudf/detail/copy.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -144,12 +144,11 @@ std::vector<table_view> split(table_view const& input,
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> shift(
- column_view const& input,
- size_type offset,
- scalar const& fill_value,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> shift(column_view const& input,
+ size_type offset,
+ scalar const& fill_value,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @brief Performs segmented shifts for specified values.
@@ -184,24 +183,22 @@ std::unique_ptr<column> shift(
*
* @note If `offset == 0`, a copy of @p segmented_values is returned.
*/
-std::unique_ptr<column> segmented_shift(
-  column_view const& segmented_values,
-  device_span<size_type const> segment_offsets,
- size_type offset,
- scalar const& fill_value,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> segmented_shift(column_view const& segmented_values,
+                                        device_span<size_type const> segment_offsets,
+ size_type offset,
+ scalar const& fill_value,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::contiguous_split
*
* @param stream CUDA stream used for device memory operations and kernel launches.
**/
-std::vector<packed_table> contiguous_split(
-  cudf::table_view const& input,
-  std::vector<size_type> const& splits,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::vector<packed_table> contiguous_split(cudf::table_view const& input,
+                                           std::vector<size_type> const& splits,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::pack
@@ -210,7 +207,7 @@ std::vector<packed_table> contiguous_split(
**/
packed_columns pack(cudf::table_view const& input,
rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::allocate_like(column_view const&, size_type, mask_allocation_policy,
@@ -218,12 +215,11 @@ packed_columns pack(cudf::table_view const& input,
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> allocate_like(
- column_view const& input,
- size_type size,
- mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN,
- rmm::cuda_stream_view stream = cudf::get_default_stream(),
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> allocate_like(column_view const& input,
+ size_type size,
+ mask_allocation_policy mask_alloc,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::copy_if_else( column_view const&, column_view const&,
@@ -231,12 +227,11 @@ std::unique_ptr<column> allocate_like(
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> copy_if_else(
- column_view const& lhs,
- column_view const& rhs,
- column_view const& boolean_mask,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> copy_if_else(column_view const& lhs,
+ column_view const& rhs,
+ column_view const& boolean_mask,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::copy_if_else( scalar const&, column_view const&,
@@ -244,12 +239,11 @@ std::unique_ptr<column> copy_if_else(
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> copy_if_else(
- scalar const& lhs,
- column_view const& rhs,
- column_view const& boolean_mask,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> copy_if_else(scalar const& lhs,
+ column_view const& rhs,
+ column_view const& boolean_mask,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::copy_if_else( column_view const&, scalar const&,
@@ -257,12 +251,11 @@ std::unique_ptr<column> copy_if_else(
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> copy_if_else(
- column_view const& lhs,
- scalar const& rhs,
- column_view const& boolean_mask,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> copy_if_else(column_view const& lhs,
+ scalar const& rhs,
+ column_view const& boolean_mask,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::copy_if_else( scalar const&, scalar const&,
@@ -270,36 +263,33 @@ std::unique_ptr<column> copy_if_else(
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> copy_if_else(
- scalar const& lhs,
- scalar const& rhs,
- column_view const& boolean_mask,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> copy_if_else(scalar const& lhs,
+ scalar const& rhs,
+ column_view const& boolean_mask,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::sample
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<table> sample(
- table_view const& input,
- size_type const n,
- sample_with_replacement replacement = sample_with_replacement::FALSE,
- int64_t const seed = 0,
- rmm::cuda_stream_view stream = cudf::get_default_stream(),
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<table> sample(table_view const& input,
+ size_type const n,
+ sample_with_replacement replacement,
+ int64_t const seed,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::get_element
*
* @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<scalar> get_element(
- column_view const& input,
- size_type index,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<scalar> get_element(column_view const& input,
+ size_type index,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::has_nonempty_nulls
@@ -320,10 +310,9 @@ bool may_have_nonempty_nulls(column_view const& input, rmm::cuda_stream_view str
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> purge_nonempty_nulls(
- column_view const& input,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> purge_nonempty_nulls(column_view const& input,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
} // namespace detail
} // namespace cudf
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index 6eea72a1e0d..2870a891f87 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -229,14 +229,13 @@ struct DeviceType()>> {
template <typename Filter, int block_size>
struct scatter_gather_functor {
  template <typename T, std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
-  std::unique_ptr<cudf::column> operator()(
- cudf::column_view const& input,
- cudf::size_type const& output_size,
- cudf::size_type const* block_offsets,
- Filter filter,
- cudf::size_type per_thread,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
+ cudf::size_type const& output_size,
+ cudf::size_type const* block_offsets,
+ Filter filter,
+ cudf::size_type per_thread,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr)
{
auto output_column = cudf::detail::allocate_like(
input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr);
@@ -277,14 +276,13 @@ struct scatter_gather_functor {
  template <typename T, std::enable_if_t<!cudf::is_fixed_width<T>() and !cudf::is_fixed_point<T>()>* = nullptr>
-  std::unique_ptr<cudf::column> operator()(
- cudf::column_view const& input,
- cudf::size_type const& output_size,
- cudf::size_type const*,
- Filter filter,
- cudf::size_type,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
+ cudf::size_type const& output_size,
+ cudf::size_type const*,
+ Filter filter,
+ cudf::size_type,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr)
{
rmm::device_uvector<cudf::size_type> indices(output_size, stream);
@@ -320,11 +318,10 @@ struct scatter_gather_functor {
* @return unique_ptr<table> The table generated from filtered `input`.
*/
template <typename Filter>
-std::unique_ptr<table> copy_if(
- table_view const& input,
- Filter filter,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+std::unique_ptr<table> copy_if(table_view const& input,
+ Filter filter,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index b20753239ab..083b12edbf8 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -145,15 +145,14 @@ __launch_bounds__(block_size) __global__
* by `filter[i]`
*/
template <typename FilterFn, typename LeftIter, typename RightIter>
-std::unique_ptr<column> copy_if_else(
- bool nullable,
- LeftIter lhs_begin,
- LeftIter lhs_end,
- RightIter rhs,
- FilterFn filter,
- cudf::data_type output_type,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+std::unique_ptr<column> copy_if_else(bool nullable,
+ LeftIter lhs_begin,
+ LeftIter lhs_end,
+ RightIter rhs,
+ FilterFn filter,
+ cudf::data_type output_type,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr)
{
// This is the type of the thrust::optional element in the passed iterators
using Element = typename thrust::iterator_traits<LeftIter>::value_type::value_type;
diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
index 22714e97dfa..0d5aa509e08 100644
--- a/cpp/include/cudf/detail/copy_range.cuh
+++ b/cpp/include/cudf/detail/copy_range.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -203,14 +203,13 @@ void copy_range_in_place(column_view const& source,
* @param stream CUDA stream used for device memory operations and kernel launches.
* @return std::unique_ptr<column> The result target column
*/
-std::unique_ptr<column> copy_range(
- column_view const& source,
- column_view const& target,
- size_type source_begin,
- size_type source_end,
- size_type target_begin,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> copy_range(column_view const& source,
+ column_view const& target,
+ size_type source_begin,
+ size_type source_end,
+ size_type target_begin,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
} // namespace detail
} // namespace cudf
diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp
index c2e3c32b65f..c5160958165 100644
--- a/cpp/include/cudf/detail/datetime.hpp
+++ b/cpp/include/cudf/detail/datetime.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -29,70 +29,63 @@ namespace detail {
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> extract_year(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> extract_year(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::extract_month(cudf::column_view const&, rmm::mr::device_memory_resource *)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> extract_month(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> extract_month(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::extract_day(cudf::column_view const&, rmm::mr::device_memory_resource *)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> extract_day(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> extract_day(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::mr::device_memory_resource *)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> extract_weekday(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> extract_weekday(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::extract_hour(cudf::column_view const&, rmm::mr::device_memory_resource *)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> extract_hour(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> extract_hour(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::extract_minute(cudf::column_view const&, rmm::mr::device_memory_resource *)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> extract_minute(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> extract_minute(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::extract_second(cudf::column_view const&, rmm::mr::device_memory_resource *)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> extract_second(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> extract_second(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&,
@@ -100,10 +93,9 @@ std::unique_ptr<cudf::column> extract_second(
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> extract_millisecond_fraction(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> extract_millisecond_fraction(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&,
@@ -111,10 +103,9 @@ std::unique_ptr<cudf::column> extract_millisecond_fraction(
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> extract_microsecond_fraction(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> extract_microsecond_fraction(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&,
@@ -122,30 +113,27 @@ std::unique_ptr<cudf::column> extract_microsecond_fraction(
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> extract_nanosecond_fraction(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> extract_nanosecond_fraction(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::mr::device_memory_resource *)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> last_day_of_month(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> last_day_of_month(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::day_of_year(cudf::column_view const&, rmm::mr::device_memory_resource *)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> day_of_year(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> day_of_year(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::column_view const&,
@@ -153,11 +141,10 @@ std::unique_ptr<cudf::column> day_of_year(
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> add_calendrical_months(
- cudf::column_view const& timestamps,
- cudf::column_view const& months,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> add_calendrical_months(cudf::column_view const& timestamps,
+ cudf::column_view const& months,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::scalar const&,
@@ -165,26 +152,23 @@ std::unique_ptr<cudf::column> add_calendrical_months(
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> add_calendrical_months(
- cudf::column_view const& timestamps,
- cudf::scalar const& months,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> add_calendrical_months(cudf::column_view const& timestamps,
+ cudf::scalar const& months,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::mr::device_memory_resource *)
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<cudf::column> is_leap_year(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> is_leap_year(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
-std::unique_ptr<cudf::column> extract_quarter(
- cudf::column_view const& column,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<cudf::column> extract_quarter(cudf::column_view const& column,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
} // namespace detail
} // namespace datetime
diff --git a/cpp/include/cudf/detail/fill.hpp b/cpp/include/cudf/detail/fill.hpp
index e34acfff6b9..caaccfb4851 100644
--- a/cpp/include/cudf/detail/fill.hpp
+++ b/cpp/include/cudf/detail/fill.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -43,13 +43,12 @@ void fill_in_place(mutable_column_view& destination,
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> fill(
- column_view const& input,
- size_type begin,
- size_type end,
- scalar const& value,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> fill(column_view const& input,
+ size_type begin,
+ size_type end,
+ scalar const& value,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
} // namespace detail
} // namespace cudf
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index 57d834e6277..5460a0e5a76 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -583,10 +583,12 @@ void gather_bitmask(table_view const& source,
std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) {
return col->mutable_view().null_mask();
});
- auto d_target_masks = make_device_uvector_async(target_masks, stream);
+ auto d_target_masks =
+ make_device_uvector_async(target_masks, stream, rmm::mr::get_current_device_resource());
auto const device_source = table_device_view::create(source, stream);
-  auto d_valid_counts = make_zeroed_device_uvector_async<size_type>(target.size(), stream);
+  auto d_valid_counts = make_zeroed_device_uvector_async<size_type>(
+ target.size(), stream, rmm::mr::get_current_device_resource());
// Dispatch operation enum to get implementation
auto const impl = [op]() {
@@ -647,13 +649,12 @@ void gather_bitmask(table_view const& source,
* @return cudf::table Result of the gather
*/
template <typename MapIterator>
-std::unique_ptr<table> gather(
- table_view const& source_table,
- MapIterator gather_map_begin,
- MapIterator gather_map_end,
- out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK,
- rmm::cuda_stream_view stream = cudf::get_default_stream(),
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+std::unique_ptr<table> gather(table_view const& source_table,
+ MapIterator gather_map_begin,
+ MapIterator gather_map_end,
+ out_of_bounds_policy bounds_policy,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr)
{
std::vector<std::unique_ptr<column>> destination_columns;
diff --git a/cpp/include/cudf/detail/gather.hpp b/cpp/include/cudf/detail/gather.hpp
index 9d61a8de184..034eb6c1282 100644
--- a/cpp/include/cudf/detail/gather.hpp
+++ b/cpp/include/cudf/detail/gather.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -61,13 +61,12 @@ enum class negative_index_policy : bool { ALLOWED, NOT_ALLOWED };
* @param[in] mr Device memory resource used to allocate the returned table's device memory
* @return Result of the gather
*/
-std::unique_ptr<table> gather(
- table_view const& source_table,
- column_view const& gather_map,
- out_of_bounds_policy bounds_policy,
- negative_index_policy neg_indices,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<table> gather(table_view const& source_table,
+ column_view const& gather_map,
+ out_of_bounds_policy bounds_policy,
+ negative_index_policy neg_indices,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @copydoc cudf::detail::gather(table_view const&,column_view const&,table_view
@@ -76,13 +75,12 @@ std::unique_ptr<table> gather(
*
* @throws cudf::logic_error if `gather_map` span size is larger than max of `size_type`.
*/
-std::unique_ptr<table> gather(
-  table_view const& source_table,
-  device_span<size_type const> const gather_map,
- out_of_bounds_policy bounds_policy,
- negative_index_policy neg_indices,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<table> gather(table_view const& source_table,
+                              device_span<size_type const> const gather_map,
+ out_of_bounds_policy bounds_policy,
+ negative_index_policy neg_indices,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
} // namespace detail
} // namespace cudf
diff --git a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
index 9e64048b7b4..e081a626c75 100644
--- a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
+++ b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -36,12 +36,11 @@ namespace detail {
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param[in] mr Device memory resource used to allocate device memory of the returned column.
*/
-std::unique_ptr<column> group_replace_nulls(
-  cudf::column_view const& grouped_value,
-  device_span<size_type const> group_labels,
- cudf::replace_policy replace_policy,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> group_replace_nulls(cudf::column_view const& grouped_value,
+                                            device_span<size_type const> group_labels,
+ cudf::replace_policy replace_policy,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
} // namespace detail
} // namespace groupby
diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp
index e2510d75a83..663ff44ca56 100644
--- a/cpp/include/cudf/detail/groupby/sort_helper.hpp
+++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp
@@ -85,10 +85,9 @@ struct sort_groupby_helper {
* @param values The value column to group and sort
* @return the sorted and grouped column
*/
-  std::unique_ptr<column> sorted_values(
- column_view const& values,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  std::unique_ptr<column> sorted_values(column_view const& values,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @brief Groups a column of values according to `keys`
@@ -100,28 +99,25 @@ struct sort_groupby_helper {
* @param values The value column to group
* @return the grouped column
*/
-  std::unique_ptr<column> grouped_values(
- column_view const& values,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  std::unique_ptr<column> grouped_values(column_view const& values,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @brief Get a table of sorted unique keys
*
* @return a new table in which each row is a unique row in the sorted key table.
*/
-  std::unique_ptr<table> unique_keys(
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  std::unique_ptr<table> unique_keys(rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @brief Get a table of sorted keys
*
* @return a new table containing the sorted keys.
*/
-  std::unique_ptr<table> sorted_keys(
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  std::unique_ptr<table> sorted_keys(rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
/**
* @brief Get the number of groups in `keys`
diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp
index b7469d80a8d..771b3e150ec 100644
--- a/cpp/include/cudf/detail/hashing.hpp
+++ b/cpp/include/cudf/detail/hashing.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -31,29 +31,25 @@ namespace detail {
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
-std::unique_ptr<column> hash(
- table_view const& input,
- hash_id hash_function = hash_id::HASH_MURMUR3,
- uint32_t seed = cudf::DEFAULT_HASH_SEED,
- rmm::cuda_stream_view stream = cudf::get_default_stream(),
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> hash(table_view const& input,
+ hash_id hash_function,
+ uint32_t seed,
+ rmm::cuda_stream_view stream,
+ rmm::mr::device_memory_resource* mr);
-std::unique_ptr<column> murmur_hash3_32(
- table_view const& input,
- uint32_t seed = cudf::DEFAULT_HASH_SEED,
- rmm::cuda_stream_view stream = cudf::get_default_stream(),
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> murmur_hash3_32(table_view const& input,
+ uint32_t seed,
+ rmm::cuda_stream_view,
+ rmm::mr::device_memory_resource* mr);
-std::unique_ptr<column> spark_murmur_hash3_32(
- table_view const& input,
- uint32_t seed = cudf::DEFAULT_HASH_SEED,
- rmm::cuda_stream_view stream = cudf::get_default_stream(),
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> spark_murmur_hash3_32(table_view const& input,
+ uint32_t seed,
+ rmm::cuda_stream_view,
+ rmm::mr::device_memory_resource* mr);
-std::unique_ptr<column> md5_hash(
- table_view const& input,
- rmm::cuda_stream_view stream,
- rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr