diff --git a/.github/workflows/dependency-files.yml b/.github/workflows/dependency-files.yml new file mode 100644 index 00000000000..2ae939292d7 --- /dev/null +++ b/.github/workflows/dependency-files.yml @@ -0,0 +1,12 @@ +name: pr + +on: + pull_request: + +jobs: + checks: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@main + with: + enable_check_size: false + enable_check_style: false diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6eb621abcc3..608bd42d86c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -99,13 +99,13 @@ cd $CUDF_HOME **Note:** Using a conda environment is the easiest way to satisfy the library's dependencies. Instructions for a minimal build environment without conda are included below. -- Create the conda development environment `cudf_dev`: +- Create the conda development environment: ```bash # create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; # use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda11.5.yml +conda env create --name cudf_dev --file conda/environments/all_cuda-115_arch-x86_64.yaml # activate the environment conda activate cudf_dev ``` @@ -114,9 +114,6 @@ conda activate cudf_dev development environment may also need to be updated if dependency versions or pinnings are changed. -- For other CUDA versions, check the corresponding `cudf_dev_cuda*.yml` file in - `conda/environments/`. - #### Building without a conda environment - libcudf has the following minimal dependencies (in addition to those listed in the [General @@ -382,7 +379,7 @@ You can skip these checks with `git commit --no-verify` or with the short versio ## Developer Guidelines -The [C++ Developer Guide](cpp/docs/DEVELOPER_GUIDE.md) includes details on contributing to libcudf C++ code. 
+The [C++ Developer Guide](cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md) includes details on contributing to libcudf C++ code. The [Python Developer Guide](https://docs.rapids.ai/api/cudf/stable/developer_guide/index.html) includes details on contributing to cuDF Python code. diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index a8bc33e00bc..0ea39a0b365 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -82,8 +82,8 @@ conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \ # Install the conda-forge or nightly version of dask and distributed if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then - gpuci_logger "gpuci_mamba_retry update dask" - gpuci_mamba_retry update dask + gpuci_logger "gpuci_mamba_retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" + gpuci_mamba_retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed" else gpuci_logger "gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 7d67efa77b1..500c3bdbcc5 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -96,8 +96,8 @@ function install_dask { gpuci_logger "Install the conda-forge or nightly version of dask and distributed" set -x if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then - gpuci_logger "gpuci_mamba_retry update dask" - gpuci_mamba_retry update dask + gpuci_logger "gpuci_mamba_retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" + gpuci_mamba_retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed" conda list else gpuci_logger "gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} 
conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" @@ -111,6 +111,8 @@ function install_dask { set +x } +install_dask + if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then gpuci_logger "Install dependencies" @@ -126,8 +128,6 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then # gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_mamba_retry install -y "your-pkg=1.0.0" - install_dask - ################################################################################ # BUILD - Build libcudf, cuDF, libcudf_kafka, dask_cudf, and strings_udf from source ################################################################################ diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 52dc22b6c49..9dcfe093643 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -63,9 +63,10 @@ sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/cudf/source sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/conf.py # bump rmm & dask-cuda -for FILE in conda/environments/*.yml; do - sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE}; +for FILE in conda/environments/*.yaml dependencies.yaml; do sed_runner "s/dask-cuda=${CURRENT_SHORT_TAG}/dask-cuda=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/rmm-cu11=${CURRENT_SHORT_TAG}/rmm-cu11=${NEXT_SHORT_TAG}/g" ${FILE}; done # Doxyfile update diff --git a/conda/environments/all_cuda-115_arch-x86_64.yaml b/conda/environments/all_cuda-115_arch-x86_64.yaml new file mode 100644 index 00000000000..a7e5f1a04a6 --- /dev/null +++ b/conda/environments/all_cuda-115_arch-x86_64.yaml @@ -0,0 +1,78 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- aiobotocore>=2.2.0 +- arrow-cpp=9 +- boto3>=1.21.21 +- botocore>=1.24.21 +- c-compiler +- cachetools +- cmake>=3.23.1 +- cubinlinker +- cuda-python>=11.7.1,<12.0 +- cudatoolkit=11.5 +- cupy>=9.5.0,<12.0.0a0 +- cxx-compiler +- cython>=0.29,<0.30 +- dask-cuda=22.12.* +- dask>=2022.9.2 +- distributed>=2022.9.2 +- dlpack>=0.5,<0.6.0a0 +- doxygen=1.8.20 +- fastavro>=0.22.9 +- fsspec>=0.6.0 +- gcc_linux-64=9.* +- hypothesis +- ipython +- librdkafka=1.7.0 +- mimesis>=4.1.0 +- moto>=4.0.8 +- myst-nb +- nbsphinx +- notebook>=0.5.0 +- numba>=0.56.2 +- numpy +- numpydoc +- nvcc_linux-64=11.5 +- nvtx>=0.2.1 +- packaging +- pandas>=1.0,<1.6.0dev0 +- pandoc<=2.0.0 +- pip +- pre-commit +- protobuf>=3.20.1,<3.21.0a0 +- ptxcompiler +- pyarrow=9.0.0 +- pydata-sphinx-theme +- pytest +- pytest-benchmark +- pytest-cases +- pytest-cov +- pytest-xdist +- python-confluent-kafka=1.7.0 +- python-snappy>=0.6.0 +- python>=3.8,<3.10 +- pytorch<1.12.0 +- rmm=22.12.* +- s3fs>=2022.3.0 +- scikit-build>=0.13.1 +- scipy +- sphinx +- sphinx-autobuild +- sphinx-copybutton +- sphinx-markdown-tables +- sphinxcontrib-websupport +- streamz +- sysroot_linux-64==2.17 +- transformers +- typing_extensions +- pip: + - git+https://github.com/python-streamz/streamz.git@master + - pyorc +name: all_cuda-115_arch-x86_64 diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml deleted file mode 100644 index 2cad2002456..00000000000 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
- -name: cudf_dev -channels: - - rapidsai - - rapidsai-nightly - - dask/label/dev - - conda-forge - - nvidia -dependencies: - - c-compiler - - cxx-compiler - - clang=11.1.0 - - clang-tools=11.1.0 - - cupy>=9.5.0,<12.0.0a0 - - rmm=22.12.* - - cmake>=3.23.1 - - cmake_setuptools>=0.1.3 - - scikit-build>=0.13.1 - - python>=3.8,<3.10 - - numba>=0.56.2 - - numpy - - pandas>=1.0,<1.6.0dev0 - - pyarrow=9 - - fastavro>=0.22.9 - - python-snappy>=0.6.0 - - notebook>=0.5.0 - - cython>=0.29,<0.30 - - fsspec>=0.6.0 - - pytest - - pytest-benchmark - - pytest-cases - - pytest-xdist - - sphinx - - sphinxcontrib-websupport - - nbsphinx - - numpydoc - - ipython - - pandoc<=2.0.0 - - cudatoolkit=11.5 - - cuda-python>=11.7.1,<12.0 - - pip - - doxygen=1.8.20 - - typing_extensions - - pre-commit - - dask>=2022.9.2 - - distributed>=2022.9.2 - - streamz - - arrow-cpp=9 - - dlpack>=0.5,<0.6.0a0 - - double-conversion - - rapidjson - - hypothesis - - sphinx-markdown-tables - - sphinx-copybutton - - sphinx-autobuild - - myst-nb - - scipy - - dask-cuda=22.12.* - - mimesis>=4.1.0 - - packaging - - protobuf>=3.20.1,<3.21.0a0 - - nvtx>=0.2.1 - - cachetools - - transformers<=4.10.3 - - pydata-sphinx-theme - - pyorc - - librdkafka=1.7.0 - - python-confluent-kafka=1.7.0 - - moto>=3.1.6 - - boto3>=1.21.21 - - botocore>=1.24.21 - - aiobotocore>=2.2.0 - - s3fs>=2022.3.0 - - werkzeug<2.2.0 # Temporary transient dependency pinning to avoid URL-LIB3 + moto timeouts - - pytorch<1.12.0 - - cubinlinker # [linux64] - - gcc_linux-64=9.* # [linux64] - - sysroot_linux-64==2.17 # [linux64] - - nvcc_linux-64=11.5 - # Un-comment following lines for ARM specific packages. 
- # - gcc_linux-aarch64=9.* # [aarch64] - # - sysroot_linux-aarch64==2.17 # [aarch64] - # - nvcc_linux-aarch64=11.5 # [aarch64] - - pip: - - git+https://github.com/python-streamz/streamz.git@master diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 1d0153c94be..15d2fcc2a36 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -236,6 +236,7 @@ outputs: - test -f $PREFIX/include/cudf/strings/json.hpp - test -f $PREFIX/include/cudf/strings/padding.hpp - test -f $PREFIX/include/cudf/strings/regex/flags.hpp + - test -f $PREFIX/include/cudf/strings/regex/regex_program.hpp - test -f $PREFIX/include/cudf/strings/repeat_strings.hpp - test -f $PREFIX/include/cudf/strings/replace.hpp - test -f $PREFIX/include/cudf/strings/replace_re.hpp @@ -254,7 +255,7 @@ outputs: - test -f $PREFIX/include/cudf/structs/structs_column_view.hpp - test -f $PREFIX/include/cudf/table/table.hpp - test -f $PREFIX/include/cudf/table/table_view.hpp - - test -f $PREFIX/include/cudf/tdigest/tdigest_column_view.cuh + - test -f $PREFIX/include/cudf/tdigest/tdigest_column_view.hpp - test -f $PREFIX/include/cudf/transform.hpp - test -f $PREFIX/include/cudf/transpose.hpp - test -f $PREFIX/include/cudf/types.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 03cf4c7d2b7..7e8ee5b60bf 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -69,6 +69,8 @@ option(CUDA_ENABLE_LINEINFO option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON) # cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) +option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." 
OFF) +mark_as_advanced(USE_LIBARROW_FROM_PYARROW) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") @@ -344,7 +346,10 @@ add_library( src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu src/io/parquet/page_hdr.cu - src/io/parquet/reader_impl.cu + src/io/parquet/reader.cpp + src/io/parquet/reader_impl.cpp + src/io/parquet/reader_impl_helpers.cpp + src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu @@ -499,7 +504,8 @@ add_library( src/strings/padding.cu src/strings/json/json_path.cu src/strings/regex/regcomp.cpp - src/strings/regex/regexec.cu + src/strings/regex/regexec.cpp + src/strings/regex/regex_program.cpp src/strings/repeat_strings.cu src/strings/replace/backref_re.cu src/strings/replace/multi_re.cu @@ -690,10 +696,10 @@ add_library(cudf::cudf ALIAS cudf) add_library( cudftestutil STATIC tests/io/metadata_utilities.cpp - tests/quantiles/tdigest_utilities.cu tests/utilities/base_fixture.cpp tests/utilities/column_utilities.cu tests/utilities/table_utilities.cu + tests/utilities/tdigest_utilities.cu ) set_target_properties( diff --git a/cpp/benchmarks/join/conditional_join.cu b/cpp/benchmarks/join/conditional_join.cu index 3c4208bf0fc..547367ffb69 100644 --- a/cpp/benchmarks/join/conditional_join.cu +++ b/cpp/benchmarks/join/conditional_join.cu @@ -70,7 +70,7 @@ CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_ cudf::table_view const& right, \ cudf::ast::operation binary_pred, \ cudf::null_equality compare_nulls) { \ - return cudf::conditional_inner_join(left, right, binary_pred); \ + return cudf::conditional_full_join(left, right, binary_pred); \ }; \ constexpr bool is_conditional = true; \ BM_join(st, join); \ diff --git a/cpp/benchmarks/string/json.cu b/cpp/benchmarks/string/json.cu index 87528608cc7..d7c0066eb33 100644 --- 
a/cpp/benchmarks/string/json.cu +++ b/cpp/benchmarks/string/json.cu @@ -177,8 +177,8 @@ auto build_json_string_column(int desired_bytes, int num_rows) auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2)); json_benchmark_row_builder jb{ desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order}; - auto children = - cudf::strings::detail::make_strings_children(jb, num_rows, cudf::get_default_stream()); + auto children = cudf::strings::detail::make_strings_children( + jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); return cudf::make_strings_column( num_rows, std::move(children.first), std::move(children.second), 0, {}); } diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 9fa5b9d1658..94dcdcb5bc2 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -20,43 +20,98 @@ # cmake-lint: disable=R0912,R0913,R0915 +include_guard(GLOBAL) + +# Generate a FindArrow module for the case where we need to search for arrow within a pip install +# pyarrow. +function(find_libarrow_in_python_wheel PYARROW_VERSION) + string(REPLACE "." 
"" PYARROW_SO_VER "${PYARROW_VERSION}") + set(PYARROW_LIB libarrow.so.${PYARROW_SO_VER}) + + find_package(Python REQUIRED) + execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" + OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") + rapids_find_generate_module( + Arrow NO_CONFIG + VERSION "${PYARROW_VERSION}" + LIBRARY_NAMES "${PYARROW_LIB}" + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + HEADER_NAMES arrow/python/arrow_to_pandas.h + ) + + find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) + add_library(arrow_shared ALIAS Arrow::Arrow) + + # When using the libarrow inside a wheel we must build libcudf with the old ABI because pyarrow's + # `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old ABI. Note + # that these flags will often be redundant because we build wheels in manylinux containers that + # actually have the old libc++ anyway, but setting them explicitly ensures correct and consistent + # behavior in all other cases such as aarch builds on newer manylinux or testing builds in newer + # containers. Note that tests will not build successfully without also propagating these options + # to builds of GTest. Similarly, benchmarks will not work without updating GBench (and possibly + # NVBench) builds. We are currently ignoring these limitations since we don't anticipate using + # this feature except for building wheels. + target_compile_options( + Arrow::Arrow INTERFACE "$<$:-D_GLIBCXX_USE_CXX11_ABI=0>" + "$<$:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>" + ) + + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) + + list(POP_BACK CMAKE_PREFIX_PATH) +endfunction() + # This function finds arrow and sets any additional necessary environment variables. 
function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON ENABLE_PARQUET ) + if(USE_LIBARROW_FROM_PYARROW) + # Generate a FindArrow.cmake to find pyarrow's libarrow.so + find_libarrow_in_python_wheel(${VERSION}) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + arrow_shared + PARENT_SCOPE + ) + return() + endif() + if(BUILD_STATIC) if(TARGET arrow_static) - list(APPEND ARROW_LIBRARIES arrow_static) set(ARROW_FOUND TRUE PARENT_SCOPE ) set(ARROW_LIBRARIES - ${ARROW_LIBRARIES} + arrow_static PARENT_SCOPE ) return() endif() else() if(TARGET arrow_shared) - list(APPEND ARROW_LIBRARIES arrow_shared) set(ARROW_FOUND TRUE PARENT_SCOPE ) set(ARROW_LIBRARIES - ${ARROW_LIBRARIES} + arrow_shared PARENT_SCOPE ) return() endif() endif() - set(ARROW_BUILD_SHARED ON) - set(ARROW_BUILD_STATIC OFF) - set(CPMAddOrFindPackage CPMFindPackage) - if(NOT ARROW_ARMV8_ARCH) set(ARROW_ARMV8_ARCH "armv8-a") endif() @@ -69,8 +124,11 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_BUILD_STATIC ON) set(ARROW_BUILD_SHARED OFF) # Turn off CPM using `find_package` so we always download and make sure we get proper static - # library - set(CPM_DOWNLOAD_ALL TRUE) + # library. 
+ set(CPM_DOWNLOAD_Arrow TRUE) + else() + set(ARROW_BUILD_SHARED ON) + set(ARROW_BUILD_STATIC OFF) endif() set(ARROW_PYTHON_OPTIONS "") @@ -91,7 +149,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared + GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared arrow_static parquet_static + arrow_dataset_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} @@ -125,61 +184,65 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB "xsimd_SOURCE AUTO" ) - set(ARROW_FOUND TRUE) - set(ARROW_LIBRARIES "") + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) - # Arrow_ADDED: set if CPM downloaded Arrow from Github Arrow_DIR: set if CPM found Arrow on the - # system/conda/etc. - if(Arrow_ADDED OR Arrow_DIR) - if(BUILD_STATIC) - list(APPEND ARROW_LIBRARIES arrow_static) - else() - list(APPEND ARROW_LIBRARIES arrow_shared) - endif() + if(BUILD_STATIC) + set(ARROW_LIBRARIES arrow_static) + else() + set(ARROW_LIBRARIES arrow_shared) + endif() - if(Arrow_DIR) - find_package(Arrow REQUIRED QUIET) - if(ENABLE_PARQUET) - if(NOT Parquet_DIR) - # Set this to enable `find_package(Parquet)` - set(Parquet_DIR "${Arrow_DIR}") - endif() - # Set this to enable `find_package(ArrowDataset)` - set(ArrowDataset_DIR "${Arrow_DIR}") - find_package(ArrowDataset REQUIRED QUIET) + # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. + if(Arrow_DIR) + # This extra find_package is necessary because rapids_cpm_find does not propagate all the + # variables from find_package that we might need. This is especially problematic when + # rapids_cpm_find builds from source. + find_package(Arrow REQUIRED QUIET) + if(ENABLE_PARQUET) + # Setting Parquet_DIR is conditional because parquet may be installed independently of arrow. 
+ if(NOT Parquet_DIR) + # Set this to enable `find_package(Parquet)` + set(Parquet_DIR "${Arrow_DIR}") endif() - elseif(Arrow_ADDED) - # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to - # target_include_directories. That defeats ccache. - file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" + # Set this to enable `find_package(ArrowDataset)` + set(ArrowDataset_DIR "${Arrow_DIR}") + find_package(ArrowDataset REQUIRED QUIET) + endif() + # Arrow_ADDED: set if CPM downloaded Arrow from Github + elseif(Arrow_ADDED) + # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to + # target_include_directories. That defeats ccache. + file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" + ) + if(ENABLE_PARQUET) + file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" ) - if(ENABLE_PARQUET) - file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" - ) - endif() - # - # This shouldn't be necessary! - # - # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` - # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. - # - # This only works because we know exactly which components we're using. Don't forget to update - # this list if we add more! - # - foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) - target_include_directories( - ${ARROW_LIBRARY} - INTERFACE "$" - "$" - "$" - "$" - ) - endforeach() endif() + # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` + # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. + # + # This only works because we know exactly which components we're using. Don't forget to update + # this list if we add more! 
+ # + foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) + target_include_directories( + ${ARROW_LIBRARY} + INTERFACE "$" + "$" + "$" + "$" + ) + endforeach() else() - set(ARROW_FOUND FALSE) + set(ARROW_FOUND + FALSE + PARENT_SCOPE + ) message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.") endif() @@ -294,15 +357,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) endif() - set(ARROW_FOUND - "${ARROW_FOUND}" - PARENT_SCOPE - ) set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE ) - endfunction() if(NOT DEFINED CUDF_VERSION_Arrow) diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index e49e270625b..3c085984a0e 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -403,6 +403,14 @@ Functions like merge or groupby in libcudf make no guarantees about the order of Promising deterministic ordering is not, in general, conducive to fast parallel algorithms. Calling code is responsible for performing sorts after the fact if sorted outputs are needed. +## libcudf does not promise specific exception messages + +libcudf documents the exceptions that will be thrown by an API for different kinds of invalid inputs. +The types of those exceptions (e.g. `cudf::logic_error`) are part of the public API. +However, the explanatory string returned by the `what` method of those exceptions is not part of the API and is subject to change. +Calling code should not rely on the contents of libcudf error messages to determine the nature of the error. +For information on the types of exceptions that libcudf throws under different circumstances, see the [section on error handling](#errors). + # libcudf API and Implementation ## Streams @@ -837,7 +845,7 @@ description of what has broken from the past release. 
Label pull requests that c with the "non-breaking" tag. -# Error Handling +# Error Handling {#errors} libcudf follows conventions (and provides utilities) enforcing compile-time and run-time conditions and detecting and handling CUDA errors. Communication of errors is always via C++ diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 2acdc007afa..57d834e6277 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -128,7 +128,7 @@ void gather_helper(InputItr source_itr, { using map_type = typename std::iterator_traits::value_type; if (nullify_out_of_bounds) { - thrust::gather_if(rmm::exec_policy(stream), + thrust::gather_if(rmm::exec_policy_nosync(stream), gather_map_begin, gather_map_end, gather_map_begin, @@ -137,7 +137,7 @@ void gather_helper(InputItr source_itr, bounds_checker{0, source_size}); } else { thrust::gather( - rmm::exec_policy(stream), gather_map_begin, gather_map_end, source_itr, target_itr); + rmm::exec_policy_nosync(stream), gather_map_begin, gather_map_end, source_itr, target_itr); } } diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp index 752f8ef6367..3764b03641e 100644 --- a/cpp/include/cudf/detail/quantiles.hpp +++ b/cpp/include/cudf/detail/quantiles.hpp @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include #include diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index ad5a2134afe..c8b17e22df2 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -79,14 +79,14 @@ auto scatter_to_gather(MapIterator scatter_map_begin, // We'll use the `numeric_limits::lowest()` value for this since it should always be outside the // valid range. 
auto gather_map = rmm::device_uvector(gather_rows, stream); - thrust::uninitialized_fill(rmm::exec_policy(stream), + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), gather_map.begin(), gather_map.end(), std::numeric_limits::lowest()); // Convert scatter map to a gather map thrust::scatter( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(std::distance(scatter_map_begin, scatter_map_end)), scatter_map_begin, @@ -114,13 +114,13 @@ auto scatter_to_gather_complement(MapIterator scatter_map_begin, rmm::cuda_stream_view stream) { auto gather_map = rmm::device_uvector(gather_rows, stream); - thrust::sequence(rmm::exec_policy(stream), gather_map.begin(), gather_map.end(), 0); + thrust::sequence(rmm::exec_policy_nosync(stream), gather_map.begin(), gather_map.end(), 0); auto const out_of_bounds_begin = thrust::make_constant_iterator(std::numeric_limits::lowest()); auto const out_of_bounds_end = out_of_bounds_begin + thrust::distance(scatter_map_begin, scatter_map_end); - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), out_of_bounds_begin, out_of_bounds_end, scatter_map_begin, @@ -152,7 +152,7 @@ struct column_scatterer_impl(), source.begin() + cudf::distance(scatter_map_begin, scatter_map_end), scatter_map_begin, @@ -226,7 +226,7 @@ struct column_scatterer_impl { auto source_itr = indexalator_factory::make_input_iterator(source_view.indices()); auto new_indices = std::make_unique(target_view.get_indices_annotated(), stream, mr); auto target_itr = indexalator_factory::make_output_iterator(new_indices->mutable_view()); - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), source_itr, source_itr + std::distance(scatter_map_begin, scatter_map_end), scatter_map_begin, diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index 77f9978ff1b..9df3f9daf3f 
100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -23,9 +23,8 @@ #include namespace cudf { -namespace detail { - namespace tdigest { +namespace detail { /** * @brief Generate a tdigest column from a grouped set of numeric input values. @@ -328,6 +327,6 @@ std::unique_ptr reduce_merge_tdigest(column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -} // namespace tdigest } // namespace detail +} // namespace tdigest } // namespace cudf diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 7675dc70cb2..8c7a7a21978 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -30,25 +30,28 @@ #include #include -namespace cudf { -namespace io { +namespace cudf::io { // Forward declaration class parquet_reader_options; class parquet_writer_options; class chunked_parquet_writer_options; -namespace detail { -namespace parquet { +namespace detail::parquet { /** * @brief Class to read Parquet dataset data into columns. */ class reader { - private: + protected: class impl; std::unique_ptr _impl; + /** + * @brief Default constructor, needed for subclassing. + */ + reader(); + public: /** * @brief Constructor from an array of datasources @@ -66,7 +69,7 @@ class reader { /** * @brief Destructor explicitly-declared to avoid inlined in header */ - ~reader(); + virtual ~reader(); /** * @brief Reads the dataset as per given options. 
@@ -154,7 +157,5 @@ class writer { const std::vector>>& metadata_list); }; -}; // namespace parquet -}; // namespace detail -}; // namespace io -}; // namespace cudf +} // namespace detail::parquet +} // namespace cudf::io diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ff5b9f5c457..c5425de308c 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -30,8 +30,7 @@ #include #include -namespace cudf { -namespace io { +namespace cudf::io { /** * @addtogroup io_readers * @{ @@ -1452,5 +1451,5 @@ class parquet_chunked_writer { }; /** @} */ // end of group -} // namespace io -} // namespace cudf + +} // namespace cudf::io diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 7db908c5b52..48c0ed8f6e9 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -89,7 +89,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the compacted outgoing offsets. 
auto count_iter = thrust::make_counting_iterator(0); thrust::transform_exclusive_scan( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), count_iter, count_iter + offset_count, dst_offsets_v.begin(), @@ -125,7 +125,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the base offsets rmm::device_uvector base_offsets = rmm::device_uvector(output_count, stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), gather_map, gather_map + output_count, base_offsets.data(), diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 5d89a9be29c..f4106fb5cdf 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -58,7 +58,7 @@ rmm::device_uvector list_vector_from_column( auto vector = rmm::device_uvector(n_rows, stream, mr); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), index_begin, index_end, vector.begin(), @@ -104,7 +104,7 @@ std::unique_ptr scatter_impl( auto const child_column_type = lists_column_view(target).child().type(); // Scatter. 
- thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), source_vector.begin(), source_vector.end(), scatter_map_begin, @@ -239,7 +239,7 @@ std::unique_ptr scatter( : cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream, mr); auto offset_column = make_numeric_column( data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); - thrust::sequence(rmm::exec_policy(stream), + thrust::sequence(rmm::exec_policy_nosync(stream), offset_column->mutable_view().begin(), offset_column->mutable_view().end(), 0, diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp index 531c7e3477d..1f3c26fa077 100644 --- a/cpp/include/cudf/quantiles.hpp +++ b/cpp/include/cudf/quantiles.hpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index d95dc2c418c..1718d205871 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -24,6 +24,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_contains * @{ @@ -58,6 +61,32 @@ std::unique_ptr contains_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying rows which + * match the given regex_program object + * + * @code{.pseudo} + * Example: + * s = ["abc", "123", "def456"] + * p = regex_program::create("\\d+") + * r = contains_re(s, p) + * r is now [false, true, true] + * @endcode + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string + */ +std::unique_ptr contains_re( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a boolean column identifying rows which * matching the given regex pattern but only at the beginning the string. @@ -85,6 +114,32 @@ std::unique_ptr matches_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying rows which + * matching the given regex_program object but only at the beginning the string. + * + * @code{.pseudo} + * Example: + * s = ["abc", "123", "def456"] + * p = regex_program::create("\\d+") + * r = matches_re(s, p) + * r is now [false, true, false] + * @endcode + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string + */ +std::unique_ptr matches_re( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns the number of times the given regex pattern * matches in each string. 
@@ -112,6 +167,32 @@ std::unique_ptr count_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns the number of times the given regex_program's pattern + * matches in each string + * + * @code{.pseudo} + * Example: + * s = ["abc", "123", "def45"] + * p = regex_program::create("\\d") + * r = count_re(s, p) + * r is now [0, 3, 2] + * @endcode + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New INT32 column with counts for each string + */ +std::unique_ptr count_re( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a boolean column identifying rows which * match the given like pattern. diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index ade28faf645..3b8ed0f4e0d 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -34,14 +34,12 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr concatenate( - table_view const& strings_columns, - string_scalar const& separator, - string_scalar const& narep, - separator_on_nulls separate_nulls = separator_on_nulls::YES, - // Move before separate_nulls? 
- rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate(table_view const& strings_columns, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc join_strings(table_view const&,string_scalar const&,string_scalar @@ -49,12 +47,11 @@ std::unique_ptr concatenate( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr join_strings( - strings_column_view const& strings, - string_scalar const& separator, - string_scalar const& narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr join_strings(strings_column_view const& strings, + string_scalar const& separator, + string_scalar const& narep, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc join_list_elements(table_view const&,string_scalar const&,string_scalar diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp index caaeb2afbe7..511e240886a 100644 --- a/cpp/include/cudf/strings/detail/concatenate.hpp +++ b/cpp/include/cudf/strings/detail/concatenate.hpp @@ -42,10 +42,9 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. 
*/ -std::unique_ptr concatenate( - host_span columns, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate(host_span columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 79cec779e02..374c3b2cf68 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -56,13 +56,12 @@ namespace detail { * @return New strings column. */ template -std::unique_ptr copy_if_else( - StringIterLeft lhs_begin, - StringIterLeft lhs_end, - StringIterRight rhs_begin, - Filter filter_fn, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr copy_if_else(StringIterLeft lhs_begin, + StringIterLeft lhs_end, + StringIterRight rhs_begin, + Filter filter_fn, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = std::distance(lhs_begin, lhs_end); if (strings_count == 0) return make_empty_column(type_id::STRING); diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index e83f6dc0005..ee09ce9a7a9 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -99,14 +99,13 @@ namespace detail { * @return std::unique_ptr The result target column */ template -std::unique_ptr copy_range( - SourceValueIterator source_value_begin, - SourceValidityIterator source_validity_begin, - strings_column_view const& target, - size_type target_begin, - size_type target_end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr copy_range(SourceValueIterator source_value_begin, + 
SourceValidityIterator source_validity_begin, + strings_column_view const& target, + size_type target_begin, + size_type target_end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS( (target_begin >= 0) && (target_begin < target.size()) && (target_end <= target.size()), diff --git a/cpp/include/cudf/strings/detail/copying.hpp b/cpp/include/cudf/strings/detail/copying.hpp index c70952b0962..7e82ad4c679 100644 --- a/cpp/include/cudf/strings/detail/copying.hpp +++ b/cpp/include/cudf/strings/detail/copying.hpp @@ -49,13 +49,11 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column of size (end-start)/step. */ -std::unique_ptr copy_slice( - strings_column_view const& strings, - size_type start, - size_type end = -1, - // Move before end? - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr copy_slice(strings_column_view const& strings, + size_type start, + size_type end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Returns a new strings column created by shifting the rows by a specified offset. diff --git a/cpp/include/cudf/strings/detail/fill.hpp b/cpp/include/cudf/strings/detail/fill.hpp index 1ad9663a614..43e3f6198f3 100644 --- a/cpp/include/cudf/strings/detail/fill.hpp +++ b/cpp/include/cudf/strings/detail/fill.hpp @@ -42,13 +42,12 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. 
*/ -std::unique_ptr fill( - strings_column_view const& strings, - size_type begin, - size_type end, - string_scalar const& value, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr fill(strings_column_view const& strings, + size_type begin, + size_type end, + string_scalar const& value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index dfc8f0dacc5..28b98eac3b5 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -288,12 +288,11 @@ std::unique_ptr gather_chars(StringIterator strings_begin, * @return New strings column containing the gathered strings. */ template -std::unique_ptr gather( - strings_column_view const& strings, - MapIterator begin, - MapIterator end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr gather(strings_column_view const& strings, + MapIterator begin, + MapIterator end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const output_count = std::distance(begin, end); auto const strings_count = strings.size(); @@ -306,7 +305,7 @@ std::unique_ptr gather( auto const d_in_offsets = (strings_count > 0) ? 
strings.offsets_begin() : nullptr; auto const d_strings = column_device_view::create(strings.parent(), stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), begin, end, d_out_offsets, @@ -318,7 +317,7 @@ std::unique_ptr gather( // check total size is not too large size_t const total_bytes = thrust::transform_reduce( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), d_out_offsets, d_out_offsets + output_count, [] __device__(auto size) { return static_cast(size); }, @@ -328,8 +327,10 @@ std::unique_ptr gather( "total size of output strings is too large for a cudf column"); // In-place convert output sizes into offsets - thrust::exclusive_scan( - rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + d_out_offsets, + d_out_offsets + output_count + 1, + d_out_offsets); // build chars column cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); @@ -372,13 +373,12 @@ std::unique_ptr gather( * @return New strings column containing the gathered strings. 
*/ template -std::unique_ptr gather( - strings_column_view const& strings, - MapIterator begin, - MapIterator end, - bool nullify_out_of_bounds, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr gather(strings_column_view const& strings, + MapIterator begin, + MapIterator end, + bool nullify_out_of_bounds, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (nullify_out_of_bounds) return gather(strings, begin, end, stream, mr); return gather(strings, begin, end, stream, mr); diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp index 8ea579ae5c0..0fb06d36570 100644 --- a/cpp/include/cudf/strings/detail/json.hpp +++ b/cpp/include/cudf/strings/detail/json.hpp @@ -16,6 +16,8 @@ #pragma once +#include +#include #include #include @@ -30,12 +32,11 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches */ -std::unique_ptr get_json_object( - cudf::strings_column_view const& col, - cudf::string_scalar const& json_path, - get_json_object_options options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr get_json_object(cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + cudf::strings::get_json_object_options options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp index a9a6ef00103..aa6fb2feb3d 100644 --- a/cpp/include/cudf/strings/detail/replace.hpp +++ b/cpp/include/cudf/strings/detail/replace.hpp @@ -43,14 +43,12 @@ enum class replace_algorithm { * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ template -std::unique_ptr replace( - strings_column_view const& strings, - string_scalar const& target, - string_scalar const& repl, - int32_t maxrepl = -1, - // Move before maxrepl? - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace(strings_column_view const& strings, + string_scalar const& target, + string_scalar const& repl, + int32_t maxrepl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::strings::replace_slice(strings_column_view const&, string_scalar const&, @@ -58,14 +56,12 @@ std::unique_ptr replace( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr replace_slice( - strings_column_view const& strings, - string_scalar const& repl = string_scalar(""), - size_type start = 0, - size_type stop = -1, - // Move before repl? - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_slice(strings_column_view const& strings, + string_scalar const& repl, + size_type start, + size_type stop, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::strings::replace(strings_column_view const&, strings_column_view const&, @@ -73,12 +69,11 @@ std::unique_ptr replace_slice( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr replace( - strings_column_view const& strings, - strings_column_view const& targets, - strings_column_view const& repls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace(strings_column_view const& strings, + strings_column_view const& targets, + strings_column_view const& repls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Replaces any null string entries with the given string. @@ -98,12 +93,10 @@ std::unique_ptr replace( * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ -std::unique_ptr replace_nulls( - strings_column_view const& strings, - string_scalar const& repl = string_scalar(""), - // Move before repl? - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_nulls(strings_column_view const& strings, + string_scalar const& repl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index c8a90ea538a..55dd5bda260 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -57,18 +57,18 @@ namespace detail { * @return New strings column. 
*/ template -std::unique_ptr scatter( - SourceIterator begin, - SourceIterator end, - MapIterator scatter_map, - strings_column_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr scatter(SourceIterator begin, + SourceIterator end, + MapIterator scatter_map, + strings_column_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (target.is_empty()) return make_empty_column(type_id::STRING); // create vector of string_view's to scatter into - rmm::device_uvector target_vector = create_string_vector_from_column(target, stream); + rmm::device_uvector target_vector = + create_string_vector_from_column(target, stream, rmm::mr::get_current_device_resource()); // this ensures empty strings are not mapped to nulls in the make_strings_column function auto const size = thrust::distance(begin, end); @@ -76,7 +76,8 @@ std::unique_ptr scatter( begin, [] __device__(string_view const sv) { return sv.empty() ? 
string_view{} : sv; }); // do the scatter - thrust::scatter(rmm::exec_policy(stream), itr, itr + size, scatter_map, target_vector.begin()); + thrust::scatter( + rmm::exec_policy_nosync(stream), itr, itr + size, scatter_map, target_vector.begin()); // build the output column auto sv_span = cudf::device_span(target_vector); diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index 9404ac14775..76e5f931981 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -50,11 +50,10 @@ namespace detail { * @return offsets child column for strings column */ template -std::unique_ptr make_offsets_child_column( - InputIterator begin, - InputIterator end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr make_offsets_child_column(InputIterator begin, + InputIterator end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(begin < end, "Invalid iterator range"); auto count = thrust::distance(begin, end); @@ -117,12 +116,11 @@ __device__ inline char* copy_string(char* buffer, const string_view& d_string) * @return offsets child column and chars child column for a strings column */ template -auto make_strings_children( - SizeAndExecuteFunction size_and_exec_fn, - size_type exec_size, - size_type strings_count, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type exec_size, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto offsets_column = make_numeric_column( data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); @@ -175,11 +173,10 @@ auto make_strings_children( * @return offsets child column and chars child column for a strings column */ 
template -auto make_strings_children( - SizeAndExecuteFunction size_and_exec_fn, - size_type strings_count, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); } diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index 829e0207110..41a2654dce3 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -36,10 +36,9 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return The chars child column for a strings column. */ -std::unique_ptr create_chars_child_column( - size_type bytes, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr create_chars_child_column(size_type bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Creates a string_view vector from a strings column. 
@@ -52,7 +51,7 @@ std::unique_ptr create_chars_child_column( rmm::device_uvector create_string_vector_from_column( cudf::strings_column_view const strings, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index a30098bedb9..a80d971438d 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -23,6 +23,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_substring * @{ @@ -61,6 +64,37 @@ std::unique_ptr extract( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a table of strings columns where each column corresponds to the matching + * group specified in the given regex_program object + * + * All the strings for the first group will go in the first output column; the second group + * go in the second column and so on. Null entries are added to the columns in row `i` if + * the string at row `i` does not match. + * + * Any null string entries return corresponding null output column entries. + * + * @code{.pseudo} + * Example: + * s = ["a1", "b2", "c3"] + * p = regex_program::create("([ab])(\\d)") + * r = extract(s, p) + * r is now [ ["a", "b", null], + * ["1", "2", null] ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned table's device memory + * @return Columns of strings extracted from the input column + */ +std::unique_ptr
extract( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a lists column of strings where each string column row corresponds to the * matching group specified in the given regular expression pattern. @@ -96,6 +130,40 @@ std::unique_ptr extract_all_record( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a lists column of strings where each string column row corresponds to the + * matching group specified in the given regex_program object + * + * All the matching groups for the first row will go in the first row output column; the second + * row results will go into the second row output column and so on. + * + * A null output row will result if the corresponding input string row does not match or + * that input row is null. + * + * @code{.pseudo} + * Example: + * s = ["a1 b4", "b2", "c3 a5", "b", null] + * p = regex_program::create("([ab])(\\d)") + * r = extract_all_record(s, p) + * r is now [ ["a", "1", "b", "4"], + * ["b", "2"], + * ["a", "5"], + * null, + * null ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate any returned device memory + * @return Lists column containing strings extracted from the input column + */ +std::unique_ptr extract_all_record( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index 6969ba35b1b..366e1eb0482 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -23,6 +23,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_contains * @{ @@ -63,6 +66,39 @@ std::unique_ptr findall( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a lists column of strings for each matching occurrence using + * the regex_program pattern within each string + * + * Each output row includes all the substrings within the corresponding input row + * that match the given pattern. If no matches are found, the output row is empty. + * + * @code{.pseudo} + * Example: + * s = ["bunny", "rabbit", "hare", "dog"] + * p = regex_program::create("[ab]") + * r = findall(s, p) + * r is now a lists column like: + * [ ["b"] + * ["a","b","b"] + * ["a"] + * [] ] + * @endcode + * + * A null output row occurs if the corresponding input row is null. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param input Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New lists column of strings + */ +std::unique_ptr findall( + strings_column_view const& input, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index 3a7051345fa..44ca68439e7 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -21,7 +21,7 @@ namespace cudf { namespace strings { /** - * @addtogroup strings_contains + * @addtogroup strings_regex * @{ */ diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp new file mode 100644 index 00000000000..2b606393719 --- /dev/null +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include + +namespace cudf { +namespace strings { + +/** + * @addtogroup strings_regex + * @{ + */ + +/** + * @brief Regex program class + * + * Create an instance from a regex pattern and use it to call the appropriate + * strings APIs. 
An instance can be reused. + * + * See the @ref md_regex "Regex Features" page for details on patterns and APIs that support regex. + */ +struct regex_program { + struct regex_program_impl; + + /** + * @brief Create a program from a pattern + * + * @throw cudf::logic_error If pattern is invalid or contains unsupported features + * + * @param pattern Regex pattern + * @param flags Regex flags for interpreting special characters in the pattern + * @param capture Controls how capture groups in the pattern are used + * @return Instance of this object + */ + static std::unique_ptr create(std::string_view pattern, + regex_flags flags = regex_flags::DEFAULT, + capture_groups capture = capture_groups::EXTRACT); + + /** + * @brief Move constructor + * + * @param other Object to move from + */ + regex_program(regex_program&& other); + + /** + * @brief Move operator assignment + * + * @param other Object to move from + * @return this object + */ + regex_program& operator=(regex_program&& other); + + /** + * @brief Return the pattern used to create this instance + * + * @return regex pattern as a string + */ + std::string pattern() const; + + /** + * @brief Return the regex_flags used to create this instance + * + * @return regex flags setting + */ + regex_flags flags() const; + + /** + * @brief Return the capture_groups used to create this instance + * + * @return capture groups setting + */ + capture_groups capture() const; + + /** + * @brief Return the number of instructions in this instance + * + * @return Number of instructions + */ + int32_t instructions_count() const; + + /** + * @brief Return the number of capture groups in this instance + * + * @return Number of groups + */ + int32_t groups_count() const; + + /** + * @brief Return the pattern used to create this instance + * + * @param num_strings Number of strings for computation + * @return Size of the working memory in bytes + */ + std::size_t compute_working_memory_size(int32_t num_strings) const; + + 
~regex_program(); + + private: + regex_program() = delete; + + std::string _pattern; + regex_flags _flags; + capture_groups _capture; + + std::unique_ptr _impl; + + /** + * @brief Constructor + * + * Called by create() + */ + regex_program(std::string_view pattern, regex_flags flags, capture_groups capture); + + friend struct regex_device_builder; +}; + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index d80b9a89b81..60c66956fb8 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -26,6 +26,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_replace * @{ @@ -58,6 +61,30 @@ std::unique_ptr replace_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief For each string, replaces any character sequence matching the given regex + * with the provided replacement string. + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param replacement The string used to replace the matched sequence in each string. + * Default is an empty string. + * @param max_replace_count The maximum number of times to replace the matched pattern + * within each string. Default replaces every substring that is matched. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ +std::unique_ptr replace_re( + strings_column_view const& strings, + regex_program const& prog, + string_scalar const& replacement = string_scalar(""), + std::optional max_replace_count = std::nullopt, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief For each string, replaces any character sequence matching the given patterns * with the corresponding string in the `replacements` column. @@ -105,5 +132,28 @@ std::unique_ptr replace_with_backrefs( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief For each string, replaces any character sequence matching the given regex + * using the replacement template for back-references. + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @throw cudf::logic_error if capture index values in `replacement` are not in range 0-99, and also + * if the index exceeds the group count specified in the pattern + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param replacement The replacement template for creating the output string + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ +std::unique_ptr replace_with_backrefs( + strings_column_view const& strings, + regex_program const& prog, + std::string_view replacement, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index 6fe07b0f5dc..c6bd1345ae6 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -23,6 +23,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_split * @{ @@ -77,6 +80,58 @@ std::unique_ptr
split_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits strings elements into a table of strings columns + * using a regex_program's pattern to delimit each string + * + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of strings[row]` + * where `token` is a substring between delimiters. + * + * The number of rows in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * The regex_program's regex_flags are ignored. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * p1 = regex_program::create("[_ ]") + * s1 = split_re(s, p1) + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * p2 = regex_program::create("[ _]") + * s2 = split_re(s, p2, 1) + * s2 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc def_g", "_bc", "ab cd", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split + * @param prog Regex program instance + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory + * @return A table of columns of strings + */ +std::unique_ptr
split_re( + strings_column_view const& input, + regex_program const& prog, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Splits strings elements into a table of strings columns * using a regex pattern to delimit each string starting from the end of the string. @@ -127,6 +182,60 @@ std::unique_ptr
rsplit_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits strings elements into a table of strings columns using a + * regex_program's pattern to delimit each string starting from the end of the string + * + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of string[row]` + * where `token` is the substring between each delimiter. + * + * The number of rows in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * The regex_program's regex_flags are ignored. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * p1 = regex_program::create("[_ ]") + * s1 = rsplit_re(s, p1) + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * p2 = regex_program::create("[ _]") + * s2 = rsplit_re(s, p2, 1) + * s2 is a table of strings columns: + * [ ["a_bc def", "a_", "_ab", "ab"], + * ["g", "bc", "cd", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param prog Regex program instance + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. 
+ * @param mr Device memory resource used to allocate the returned result's device memory. + * @return A table of columns of strings. + */ +std::unique_ptr
rsplit_re( + strings_column_view const& input, + regex_program const& prog, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Splits strings elements into a list column of strings * using the given regex pattern to delimit each string. @@ -179,6 +288,62 @@ std::unique_ptr split_record_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits strings elements into a list column of strings + * using the given regex_program to delimit each string + * + * Each element generates an array of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null output row. + * + * The regex_program's regex_flags are ignored. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * p1 = regex_program::create("[_ ]") + * s1 = split_record_re(s, p1) + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * p2 = regex_program::create("[ _]") + * s2 = split_record_re(s, p2, 1) + * s2 is a lists column of strings: + * [ ["a", "bc def_g"], + * ["a", "_bc"], + * ["", "ab cd"], + * ["ab", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. 
+ * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param input A column of string elements to be split + * @param prog Regex program instance + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings. + */ +std::unique_ptr split_record_re( + strings_column_view const& input, + regex_program const& prog, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Splits strings elements into a list column of strings * using the given regex pattern to delimit each string starting from the end of the string. @@ -233,6 +398,64 @@ std::unique_ptr rsplit_record_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits strings elements into a list column of strings using the given + * regex_program to delimit each string starting from the end of the string + * + * Each element generates a vector of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the separation points within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty list item output row. 
+ * A null row will produce a corresponding null output row. + * + * The regex_program's regex_flags are ignored. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * p1 = regex_program::create("[_ ]") + * s1 = rsplit_record_re(s, p1) + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * p2 = regex_program::create("[ _]") + * s2 = rsplit_record_re(s, p2, 1) + * s2 is a lists column of strings: + * [ ["a_bc def", "g"], + * ["a_", "bc"], + * ["_ab", "cd"], + * ["ab_cd", ""] ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split + * @param prog Regex program instance + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings + */ +std::unique_ptr rsplit_record_re( + strings_column_view const& input, + regex_program const& prog, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.cuh b/cpp/include/cudf/tdigest/tdigest_column_view.hpp similarity index 83% rename from cpp/include/cudf/tdigest/tdigest_column_view.cuh rename to cpp/include/cudf/tdigest/tdigest_column_view.hpp index 64371fd5c45..c63e2b16326 100644 --- a/cpp/include/cudf/tdigest/tdigest_column_view.cuh +++ b/cpp/include/cudf/tdigest/tdigest_column_view.hpp @@ -16,30 +16,11 @@ #pragma once #include -#include #include namespace cudf { namespace tdigest { -/** - * @brief Functor to compute the size of each tdigest of a column. 
- * - */ -struct tdigest_size { - size_type const* offsets; ///< Offsets of the t-digest column - /** - * @brief Returns size of the each tdigest in the column - * - * @param tdigest_index Index of the tdigest in the column - * @return Size of the tdigest - */ - __device__ size_type operator()(size_type tdigest_index) - { - return offsets[tdigest_index + 1] - offsets[tdigest_index]; - } -}; - /** * @brief Given a column_view containing tdigest data, an instance of this class * provides a wrapper on the compound column for tdigest operations. @@ -127,18 +108,6 @@ class tdigest_column_view : private column_view { */ [[nodiscard]] column_view weights() const; - /** - * @brief Returns an iterator that returns the size of each tdigest - * in the column (each row is 1 digest) - * - * @return An iterator that returns the size of each tdigest in the column - */ - [[nodiscard]] auto size_begin() const - { - return cudf::detail::make_counting_transform_iterator( - 0, tdigest_size{centroids().offsets_begin()}); - } - /** * @brief Returns the first min value for the column. Each row corresponds * to the minimum value for the accompanying digest. 
diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp index fb2680545d3..ab45d90f2d2 100644 --- a/cpp/include/cudf_test/cudf_gtest.hpp +++ b/cpp/include/cudf_test/cudf_gtest.hpp @@ -110,58 +110,6 @@ struct TypeList> { */ #define EXPECT_CUDA_SUCCEEDED(expr) EXPECT_EQ(cudaSuccess, expr) -/** - * @brief Utility for testing the expectation that an expression x throws the specified - * exception whose what() message ends with the msg - * - * @param x The expression to test - * @param exception The exception type to test for - * @param startswith The start of the expected message - * @param endswith The end of the expected message - */ -#define EXPECT_THROW_MESSAGE(x, exception, startswith, endswith) \ - do { \ - EXPECT_THROW( \ - { \ - try { \ - x; \ - } catch (const exception& e) { \ - ASSERT_NE(nullptr, e.what()); \ - EXPECT_THAT(e.what(), testing::StartsWith((startswith))); \ - EXPECT_THAT(e.what(), testing::EndsWith((endswith))); \ - throw; \ - } \ - }, \ - exception); \ - } while (0) - -/** - * @brief test macro to be expected to throw cudf::logic_error with a message - * - * @param x The statement to be tested - * @param msg The message associated with the exception - */ -#define CUDF_EXPECT_THROW_MESSAGE(x, msg) \ - EXPECT_THROW_MESSAGE(x, cudf::logic_error, "cuDF failure at:", msg) - -/** - * @brief test macro to be expected to throw cudf::cuda_error with a message - * - * @param x The statement to be tested - * @param msg The message associated with the exception - */ -#define CUDA_EXPECT_THROW_MESSAGE(x, msg) \ - EXPECT_THROW_MESSAGE(x, cudf::cuda_error, "CUDA error encountered at:", msg) - -/** - * @brief test macro to be expected to throw cudf::fatal_logic_error with a message - * - * @param x The statement to be tested - * @param msg The message associated with the exception - */ -#define FATAL_CUDA_EXPECT_THROW_MESSAGE(x, msg) \ - EXPECT_THROW_MESSAGE(x, cudf::fatal_cuda_error, "Fatal CUDA error encountered at:", msg) - 
/** * @brief test macro to be expected as no exception. * diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh index 250f8ea8580..ce45ad91be1 100644 --- a/cpp/include/cudf_test/tdigest_utilities.cuh +++ b/cpp/include/cudf_test/tdigest_utilities.cuh @@ -16,16 +16,14 @@ #pragma once +#include + #include #include #include -#include +#include #include -#include - -#include - #include #include #include @@ -102,6 +100,58 @@ struct tdigest_gen { // @endcond }; +template +inline T frand() +{ + return static_cast(rand()) / static_cast(RAND_MAX); +} + +template +inline T rand_range(T min, T max) +{ + return min + static_cast(frand() * (max - min)); +} + +inline std::unique_ptr generate_typed_percentile_distribution( + std::vector const& buckets, + std::vector const& sizes, + data_type t, + bool sorted = false) +{ + srand(0); + + std::vector values; + size_t total_size = std::reduce(sizes.begin(), sizes.end(), 0); + values.reserve(total_size); + for (size_t idx = 0; idx < sizes.size(); idx++) { + double min = idx == 0 ? 0.0f : buckets[idx - 1]; + double max = buckets[idx]; + + for (int v_idx = 0; v_idx < sizes[idx]; v_idx++) { + values.push_back(rand_range(min, max)); + } + } + + if (sorted) { std::sort(values.begin(), values.end()); } + + cudf::test::fixed_width_column_wrapper src(values.begin(), values.end()); + return cudf::cast(src, t); +} + +// "standardized" means the parameters sent into generate_typed_percentile_distribution. the intent +// is to provide a standardized set of inputs for use with tdigest generation tests and +// percentile_approx tests. 
std::vector +// buckets{10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0}; std::vector +// sizes{50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000}; +inline std::unique_ptr generate_standardized_percentile_distribution( + data_type t = data_type{type_id::FLOAT64}, bool sorted = false) +{ + std::vector buckets{10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0, 90.0f, 100.0f}; + std::vector b_sizes{ + 50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000}; + return generate_typed_percentile_distribution(buckets, b_sizes, t, sorted); +} + /** * @brief Compare a tdigest column against a sampling of expected values. */ @@ -217,7 +267,7 @@ void tdigest_simple_all_nulls_aggregation(Func op) static_cast(values).type(), tdigest_gen{}, op, values, delta); // NOTE: an empty tdigest column still has 1 row. - auto expected = cudf::detail::tdigest::make_empty_tdigest_column(cudf::get_default_stream()); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -508,9 +558,9 @@ template void tdigest_merge_empty(MergeFunc merge_op) { // 3 empty tdigests all in the same group - auto a = cudf::detail::tdigest::make_empty_tdigest_column(cudf::get_default_stream()); - auto b = cudf::detail::tdigest::make_empty_tdigest_column(cudf::get_default_stream()); - auto c = cudf::detail::tdigest::make_empty_tdigest_column(cudf::get_default_stream()); + auto a = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); + auto b = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); + auto c = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); std::vector cols; cols.push_back(*a); cols.push_back(*b); @@ -520,7 +570,7 @@ void tdigest_merge_empty(MergeFunc merge_op) auto const delta = 1000; auto result = merge_op(*values, delta); - auto expected = 
cudf::detail::tdigest::make_empty_tdigest_column(cudf::get_default_stream()); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index c0ea06959b2..5c335b720d5 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -129,6 +129,7 @@ * @defgroup strings_replace Replacing * @defgroup strings_split Splitting * @defgroup strings_json JSON + * @defgroup strings_regex Regex * @} * @defgroup dictionary_apis Dictionary * @{ diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 802b47e4664..577d6427b19 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -180,10 +180,8 @@ __global__ void fused_concatenate_kernel(column_device_view const* input_views, if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); } while (output_index < output_size) { // Lookup input index by searching for output index in offsets - // thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead - auto const offset_it = - -1 + thrust::upper_bound( - thrust::seq, input_offsets, input_offsets + num_input_views, output_index); + auto const offset_it = thrust::prev(thrust::upper_bound( + thrust::seq, input_offsets, input_offsets + num_input_views, output_index)); size_type const partition_index = offset_it - input_offsets; // Copy input data to output diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 4ebe465b945..6083a698560 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -119,7 +119,7 @@ struct column_scalar_scatterer_impl { auto scalar_iter = thrust::make_permutation_iterator(scalar_impl->data(), thrust::make_constant_iterator(0)); - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), scalar_iter, scalar_iter + scatter_rows, 
scatter_iter, @@ -191,8 +191,11 @@ struct column_scalar_scatterer_impl { auto new_indices = std::make_unique(dict_view.get_indices_annotated(), stream, mr); auto target_iter = indexalator_factory::make_output_iterator(new_indices->mutable_view()); - thrust::scatter( - rmm::exec_policy(stream), scalar_iter, scalar_iter + scatter_rows, scatter_iter, target_iter); + thrust::scatter(rmm::exec_policy_nosync(stream), + scalar_iter, + scalar_iter + scatter_rows, + scatter_iter, + target_iter); // build the dictionary indices column from the result auto const indices_type = new_indices->type(); @@ -383,7 +386,7 @@ std::unique_ptr boolean_mask_scatter(column_view const& input, data_type{type_id::INT32}, target.size(), mask_state::UNALLOCATED, stream); auto mutable_indices = indices->mutable_view(); - thrust::sequence(rmm::exec_policy(stream), + thrust::sequence(rmm::exec_policy_nosync(stream), mutable_indices.begin(), mutable_indices.end(), 0); diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index c07833520ab..90c869b8c58 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -512,18 +512,33 @@ rmm::device_uvector extract_populated_keys(map_type const& map, { rmm::device_uvector populated_keys(num_keys, stream); - auto get_key = [] __device__(auto const& element) { return element.first; }; // first = key - auto get_key_it = thrust::make_transform_iterator(map.data(), get_key); - auto key_used = [unused = map.get_unused_key()] __device__(auto key) { return key != unused; }; - - auto end_it = thrust::copy_if(rmm::exec_policy(stream), - get_key_it, - get_key_it + map.capacity(), - populated_keys.begin(), - key_used); - - populated_keys.resize(std::distance(populated_keys.begin(), end_it), stream); + auto const get_key = [] __device__(auto const& element) { return element.first; }; // first = key + auto const key_used = [unused = map.get_unused_key()] __device__(auto key) { + return key != unused; + }; + auto key_itr 
= thrust::make_transform_iterator(map.data(), get_key); + + // thrust::copy_if has a bug where it cannot iterate over int-max values + // so if map.capacity() > int-max we'll call thrust::copy_if in chunks instead + auto const copy_size = + std::min(map.capacity(), static_cast(std::numeric_limits::max())); + auto const key_end = key_itr + map.capacity(); + auto pop_keys_itr = populated_keys.begin(); + + std::size_t output_size = 0; + while (key_itr != key_end) { + auto const copy_end = static_cast(std::distance(key_itr, key_end)) <= copy_size + ? key_end + : key_itr + copy_size; + auto const end_it = + thrust::copy_if(rmm::exec_policy(stream), key_itr, copy_end, pop_keys_itr, key_used); + auto const copied = std::distance(pop_keys_itr, end_it); + pop_keys_itr += copied; + output_size += copied; + key_itr = copy_end; + } + populated_keys.resize(output_size, stream); return populated_keys; } diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 55a0b89e446..e3d14f1deb7 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -701,7 +701,7 @@ void aggregate_result_functor::operator()(aggregation cons cache.add_result(values, agg, - cudf::detail::tdigest::group_tdigest( + cudf::tdigest::detail::group_tdigest( get_sorted_values(), helper.group_offsets(stream), helper.group_labels(stream), @@ -745,7 +745,7 @@ void aggregate_result_functor::operator()(aggregatio dynamic_cast(agg).max_centroids; cache.add_result(values, agg, - cudf::detail::tdigest::group_merge_tdigest(get_grouped_values(), + cudf::tdigest::detail::group_merge_tdigest(get_grouped_values(), helper.group_offsets(stream), helper.group_labels(stream), helper.num_groups(stream), diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index fd794b2e66c..fd0cbeced3a 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -31,46 +31,23 @@ #include NVCOMP_ZSTD_HEADER #endif -#if 
NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 3) -#define NVCOMP_HAS_ZSTD_DECOMP 1 -#else -#define NVCOMP_HAS_ZSTD_DECOMP 0 -#endif +#define NVCOMP_HAS_ZSTD_DECOMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 3)) -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 4) -#define NVCOMP_HAS_ZSTD_COMP 1 -#else -#define NVCOMP_HAS_ZSTD_COMP 0 -#endif +#define NVCOMP_HAS_ZSTD_COMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 4)) -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 3) -#define NVCOMP_HAS_DEFLATE 1 -#else -#define NVCOMP_HAS_DEFLATE 0 -#endif +#define NVCOMP_HAS_DEFLATE(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 5)) -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION > 3) or \ - (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION == 3 and NVCOMP_PATCH_VERSION >= 1) -#define NVCOMP_HAS_TEMPSIZE_EX 1 -#else -#define NVCOMP_HAS_TEMPSIZE_EX 0 -#endif +#define NVCOMP_HAS_TEMPSIZE_EX(MAJOR, MINOR, PATCH) \ + (MAJOR > 2 or (MAJOR == 2 and MINOR > 3) or (MAJOR == 2 and MINOR == 3 and PATCH >= 1)) // ZSTD is stable for nvcomp 2.3.2 or newer -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION > 3) or \ - (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION == 3 and NVCOMP_PATCH_VERSION >= 2) -#define NVCOMP_ZSTD_IS_STABLE 1 -#else -#define NVCOMP_ZSTD_IS_STABLE 0 -#endif +#define NVCOMP_ZSTD_DECOMP_IS_STABLE(MAJOR, MINOR, PATCH) \ + (MAJOR > 2 or (MAJOR == 2 and MINOR > 3) or (MAJOR == 2 and MINOR == 3 and PATCH >= 2)) // Issue https://github.com/NVIDIA/spark-rapids/issues/6614 impacts nvCOMP 2.4.0 ZSTD decompression // on compute 6.x -#if NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION == 4 and NVCOMP_PATCH_VERSION == 0 -#define NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL 1 -#else -#define NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL 0 -#endif +#define 
NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL(MAJOR, MINOR, PATCH) \ + (MAJOR == 2 and MINOR == 4 and PATCH == 0) namespace cudf::io::nvcomp { @@ -79,12 +56,12 @@ template std::optional batched_decompress_get_temp_size_ex(compression_type compression, Args&&... args) { -#if NVCOMP_HAS_TEMPSIZE_EX +#if NVCOMP_HAS_TEMPSIZE_EX(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) switch (compression) { case compression_type::SNAPPY: return nvcompBatchedSnappyDecompressGetTempSizeEx(std::forward(args)...); case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_DECOMP +#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedZstdDecompressGetTempSizeEx(std::forward(args)...); #else return std::nullopt; @@ -104,16 +81,18 @@ auto batched_decompress_get_temp_size(compression_type compression, Args&&... ar case compression_type::SNAPPY: return nvcompBatchedSnappyDecompressGetTempSize(std::forward(args)...); case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_DECOMP +#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedZstdDecompressGetTempSize(std::forward(args)...); #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Decompression error: " + + nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD).value()); #endif case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedDeflateDecompressGetTempSize(std::forward(args)...); #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Decompression error: " + + nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -127,16 +106,18 @@ auto batched_decompress_async(compression_type compression, Args&&... 
args) case compression_type::SNAPPY: return nvcompBatchedSnappyDecompressAsync(std::forward(args)...); case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_DECOMP +#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedZstdDecompressAsync(std::forward(args)...); #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Decompression error: " + + nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD).value()); #endif case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedDeflateDecompressAsync(std::forward(args)...); #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Decompression error: " + + nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -163,22 +144,6 @@ size_t batched_decompress_temp_size(compression_type compression, return temp_size; } -void check_is_zstd_enabled() -{ - CUDF_EXPECTS(NVCOMP_HAS_ZSTD_DECOMP, "nvCOMP 2.3 or newer is required for Zstandard compression"); - CUDF_EXPECTS(NVCOMP_ZSTD_IS_STABLE or cudf::io::detail::nvcomp_integration::is_all_enabled(), - "Zstandard compression is experimental, you can enable it through " - "`LIBCUDF_NVCOMP_POLICY` environment variable."); - -#if NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL - int device; - int cc_major; - CUDF_CUDA_TRY(cudaGetDevice(&device)); - CUDF_CUDA_TRY(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device)); - CUDF_EXPECTS(cc_major != 6, "Zstandard decompression is disabled on Pascal GPUs"); -#endif -} - void batched_decompress(compression_type compression, device_span const> inputs, device_span const> outputs, @@ -187,8 +152,6 @@ void batched_decompress(compression_type compression, size_t max_total_uncomp_size, rmm::cuda_stream_view stream) { - if (compression == compression_type::ZSTD) { 
check_is_zstd_enabled(); } - auto const num_chunks = inputs.size(); // cuDF inflate inputs converted to nvcomp inputs @@ -228,20 +191,22 @@ auto batched_compress_temp_size(compression_type compression, batch_size, max_uncompressed_chunk_bytes, nvcompBatchedSnappyDefaultOpts, &temp_size); break; case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) nvcomp_status = nvcompBatchedDeflateCompressGetTempSize( batch_size, max_uncompressed_chunk_bytes, nvcompBatchedDeflateDefaultOpts, &temp_size); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP +#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) nvcomp_status = nvcompBatchedZstdCompressGetTempSize( batch_size, max_uncompressed_chunk_bytes, nvcompBatchedZstdDefaultOpts, &temp_size); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -266,20 +231,22 @@ size_t compress_max_output_chunk_size(compression_type compression, capped_uncomp_bytes, nvcompBatchedSnappyDefaultOpts, &max_comp_chunk_size); break; case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) status = nvcompBatchedDeflateCompressGetMaxOutputChunkSize( capped_uncomp_bytes, nvcompBatchedDeflateDefaultOpts, &max_comp_chunk_size); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP +#if 
NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) status = nvcompBatchedZstdCompressGetMaxOutputChunkSize( capped_uncomp_bytes, nvcompBatchedZstdDefaultOpts, &max_comp_chunk_size); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -316,7 +283,7 @@ static void batched_compress_async(compression_type compression, stream.value()); break; case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) nvcomp_status = nvcompBatchedDeflateCompressAsync(device_uncompressed_ptrs, device_uncompressed_bytes, max_uncompressed_chunk_bytes, @@ -329,10 +296,11 @@ static void batched_compress_async(compression_type compression, stream.value()); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP +#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) nvcomp_status = nvcompBatchedZstdCompressAsync(device_uncompressed_ptrs, device_uncompressed_bytes, max_uncompressed_chunk_bytes, @@ -345,7 +313,8 @@ static void batched_compress_async(compression_type compression, stream.value()); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -390,18 +359,109 @@ void batched_compress(compression_type compression, update_compression_results(actual_compressed_data_sizes, results, stream); } -bool is_compression_enabled(compression_type compression) +feature_status_parameters::feature_status_parameters() + 
: lib_major_version{NVCOMP_MAJOR_VERSION}, + lib_minor_version{NVCOMP_MINOR_VERSION}, + lib_patch_version{NVCOMP_PATCH_VERSION}, + are_all_integrations_enabled{detail::nvcomp_integration::is_all_enabled()}, + are_stable_integrations_enabled{detail::nvcomp_integration::is_stable_enabled()} +{ + int device; + CUDF_CUDA_TRY(cudaGetDevice(&device)); + CUDF_CUDA_TRY( + cudaDeviceGetAttribute(&compute_capability_major, cudaDevAttrComputeCapabilityMajor, device)); +} + +std::optional is_compression_disabled(compression_type compression, + feature_status_parameters params) { switch (compression) { - case compression_type::DEFLATE: - // See https://github.com/rapidsai/cudf/issues/11812 - return false; - case compression_type::SNAPPY: return detail::nvcomp_integration::is_stable_enabled(); - case compression_type::ZSTD: - return NVCOMP_HAS_ZSTD_COMP and detail::nvcomp_integration::is_all_enabled(); - default: return false; + case compression_type::DEFLATE: { + if (not NVCOMP_HAS_DEFLATE( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + return "nvCOMP 2.5 or newer is required for Deflate compression"; + } + if (not params.are_all_integrations_enabled) { + return "DEFLATE compression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + } + case compression_type::SNAPPY: { + if (not params.are_stable_integrations_enabled) { + return "Snappy compression has been disabled through the `LIBCUDF_NVCOMP_POLICY` " + "environment variable."; + } + return std::nullopt; + } + case compression_type::ZSTD: { + if (not NVCOMP_HAS_ZSTD_COMP( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + return "nvCOMP 2.4 or newer is required for Zstandard compression"; + } + if (not params.are_stable_integrations_enabled) { + return "Zstandard compression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + 
return std::nullopt; + } + default: return "Unsupported compression type"; + } + return "Unsupported compression type"; +} + +std::optional is_zstd_decomp_disabled(feature_status_parameters const& params) +{ + if (not NVCOMP_HAS_ZSTD_DECOMP( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + return "nvCOMP 2.3 or newer is required for Zstandard decompression"; + } + + if (NVCOMP_ZSTD_DECOMP_IS_STABLE( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + if (not params.are_stable_integrations_enabled) { + return "Zstandard decompression has been disabled through the `LIBCUDF_NVCOMP_POLICY` " + "environment variable."; + } + } else if (not params.are_all_integrations_enabled) { + return "Zstandard decompression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + + if (NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version) and + params.compute_capability_major == 6) { + return "Zstandard decompression is disabled on Pascal GPUs"; + } + return std::nullopt; +} + +std::optional is_decompression_disabled(compression_type compression, + feature_status_parameters params) +{ + switch (compression) { + case compression_type::DEFLATE: { + if (not NVCOMP_HAS_DEFLATE( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + return "nvCOMP 2.5 or newer is required for Deflate decompression"; + } + if (not params.are_all_integrations_enabled) { + return "DEFLATE decompression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + } + case compression_type::SNAPPY: { + if (not params.are_stable_integrations_enabled) { + return "Snappy decompression has been disabled through the `LIBCUDF_NVCOMP_POLICY` " + "environment variable."; + } + return std::nullopt; + } + case compression_type::ZSTD: return 
is_zstd_decomp_disabled(params); + default: return "Unsupported compression type"; } - return false; + return "Unsupported compression type"; } size_t compress_input_alignment_bits(compression_type compression) @@ -430,10 +490,11 @@ std::optional compress_max_allowed_chunk_size(compression_type compressi case compression_type::DEFLATE: return 64 * 1024; case compression_type::SNAPPY: return std::nullopt; case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP +#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompZstdCompressionMaxAllowedChunkSize; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value()); #endif default: return std::nullopt; } diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index a13cb031163..a6bde7957c7 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -18,6 +18,8 @@ #include "gpuinflate.hpp" +#include + #include #include @@ -30,14 +32,52 @@ namespace cudf::io::nvcomp { enum class compression_type { SNAPPY, ZSTD, DEFLATE }; /** - * @brief Whether the given compression type is enabled through nvCOMP. + * @brief Set of parameters that impact whether the use nvCOMP features is enabled. 
+ */ +struct feature_status_parameters { + int lib_major_version; + int lib_minor_version; + int lib_patch_version; + bool are_all_integrations_enabled; + bool are_stable_integrations_enabled; + int compute_capability_major; + + feature_status_parameters(); + feature_status_parameters( + int major, int minor, int patch, bool all_enabled, bool stable_enabled, int cc_major) + : lib_major_version{major}, + lib_minor_version{minor}, + lib_patch_version{patch}, + are_all_integrations_enabled{all_enabled}, + are_stable_integrations_enabled{stable_enabled}, + compute_capability_major{cc_major} + { + } +}; + +/** + * @brief If a compression type is disabled through nvCOMP, returns the reason as a string. + * + * Result cab depend on nvCOMP version and environment variables. + * + * @param compression Compression type + * @param params Optional parameters to query status with different configurations + * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled + */ +[[nodiscard]] std::optional is_compression_disabled( + compression_type compression, feature_status_parameters params = feature_status_parameters()); + +/** + * @brief If a decompression type is disabled through nvCOMP, returns the reason as a string. * - * Result depends on nvCOMP version and environment variables. + * Result can depend on nvCOMP version and environment variables. * * @param compression Compression type - * @returns true if nvCOMP use is enabled; false otherwise + * @param params Optional parameters to query status with different configurations + * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled */ -[[nodiscard]] bool is_compression_enabled(compression_type compression); +[[nodiscard]] std::optional is_decompression_disabled( + compression_type compression, feature_status_parameters params = feature_status_parameters()); /** * @brief Device batch decompression of given type. 
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index d669dea3115..f812f272c25 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -134,7 +134,6 @@ std::vector get_column_names(std::vector const& header, if (header.size() <= 1) { return col_names; } std::vector first_row = header; - int num_cols = 0; bool quotation = false; for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) { @@ -163,17 +162,16 @@ std::vector get_column_names(std::vector const& header, const string new_col_name(first_row.data() + prev, col_name_len); col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar)); - - // Stop parsing when we hit the line terminator; relevant when there is - // a blank line following the header. In this case, first_row includes - // multiple line terminators at the end, as the new recStart belongs to - // a line that comes after the blank line(s) - if (!quotation && first_row[pos] == parse_opts.terminator) { break; } } else { // This is the first data row, add the automatically generated name - col_names.push_back(prefix + std::to_string(num_cols)); + col_names.push_back(prefix + std::to_string(col_names.size())); } - num_cols++; + + // Stop parsing when we hit the line terminator; relevant when there is + // a blank line following the header. 
In this case, first_row includes + // multiple line terminators at the end, as the new recStart belongs to + // a line that comes after the blank line(s) + if (!quotation && first_row[pos] == parse_opts.terminator) { break; } // Skip adjacent delimiters if delim_whitespace is set while (parse_opts.multi_delimiter && pos < first_row.size() && @@ -540,8 +538,7 @@ void infer_column_types(parse_options const& parse_opts, auto const& stats = column_stats[inf_col_idx++]; unsigned long long int_count_total = stats.big_int_count + stats.negative_small_int_count + stats.positive_small_int_count; - - if (stats.null_count == num_records) { + if (stats.null_count == num_records or stats.total_count() == 0) { // Entire column is NULL; allocate the smallest amount of memory column_types[col_idx] = data_type(cudf::type_id::INT8); } else if (stats.string_count > 0L) { diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 7230b455d4a..ed2f412f291 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -364,8 +364,11 @@ void write_chunked(data_sink* out_sink, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); cudf::string_scalar newline{options.get_line_terminator()}; - auto p_str_col_w_nl = - cudf::strings::detail::join_strings(str_column_view, newline, string_scalar("", false), stream); + auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, + newline, + string_scalar("", false), + stream, + rmm::mr::get_current_device_resource()); strings_column_view strings_column{p_str_col_w_nl->view()}; auto total_num_bytes = strings_column.chars_size(); @@ -470,9 +473,11 @@ void write_csv(data_sink* out_sink, delimiter_str, options.get_na_rep(), strings::separator_on_nulls::YES, - stream); + stream, + rmm::mr::get_current_device_resource()); cudf::string_scalar narep{options.get_na_rep()}; - return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); + return 
cudf::strings::detail::replace_nulls( + str_table_view.column(0), narep, stream, rmm::mr::get_current_device_resource()); }(); write_chunked(out_sink, str_concat_col->view(), options, stream, mr); diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index c0eaa43e68f..b0b7d5baa0f 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -19,27 +19,49 @@ #include #include +#include #include #include namespace cudf::io::detail::json::experimental { -std::vector ingest_raw_input(host_span> sources, - compression_type compression) +size_t sources_size(host_span> const sources, + size_t range_offset, + size_t range_size) { - auto const total_source_size = - std::accumulate(sources.begin(), sources.end(), 0ul, [](size_t sum, auto& source) { - return sum + source->size(); - }); - auto buffer = std::vector(total_source_size); + return std::accumulate(sources.begin(), sources.end(), 0ul, [=](size_t sum, auto& source) { + auto const size = source->size(); + // TODO take care of 0, 0, or *, 0 case. + return sum + + (range_size == 0 or range_offset + range_size > size ? size - range_offset : range_size); + }); +} + +std::vector ingest_raw_input(host_span> const& sources, + compression_type compression, + size_t range_offset, + size_t range_size) +{ + CUDF_FUNC_RANGE(); + // Iterate through the user defined sources and read the contents into the local buffer + auto const total_source_size = sources_size(sources, range_offset, range_size); + auto buffer = std::vector(total_source_size); size_t bytes_read = 0; for (const auto& source : sources) { - bytes_read += source->host_read(0, source->size(), buffer.data() + bytes_read); + if (!source->is_empty()) { + auto data_size = (range_size != 0) ? 
range_size : source->size(); + auto destination = buffer.data() + bytes_read; + bytes_read += source->host_read(range_offset, data_size, destination); + } } - return (compression == compression_type::NONE) ? buffer : decompress(compression, buffer); + if (compression == compression_type::NONE) { + return buffer; + } else { + return decompress(compression, buffer); + } } table_with_metadata read_json(host_span> sources, @@ -47,10 +69,14 @@ table_with_metadata read_json(host_span> sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); CUDF_EXPECTS(reader_opts.get_byte_range_offset() == 0 and reader_opts.get_byte_range_size() == 0, "specifying a byte range is not yet supported"); - auto const buffer = ingest_raw_input(sources, reader_opts.get_compression()); + auto const buffer = ingest_raw_input(sources, + reader_opts.get_compression(), + reader_opts.get_byte_range_offset(), + reader_opts.get_byte_range_size()); auto data = host_span(reinterpret_cast(buffer.data()), buffer.size()); try { diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index cee023a1061..0ac3efb407e 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -525,14 +525,15 @@ void make_device_json_column(device_span input, auto parent_node_id = ordered_parent_node_ids[i]; if (parent_node_id != parent_node_sentinel and node_categories[parent_node_id] == NC_LIST) { // unique item - if (i == 0 || + if (i == 0 or (col_ids[i - 1] != col_ids[i] or ordered_parent_node_ids[i - 1] != parent_node_id)) { // scatter to list_offset d_columns_data[original_col_ids[parent_node_id]] .child_offsets[row_offsets[parent_node_id]] = ordered_row_offsets[i]; } // TODO: verify if this code is right. check with more test cases. 
- if (i == num_nodes - 1 || (col_ids[i] != col_ids[i + 1])) { + if (i == num_nodes - 1 or + (col_ids[i] != col_ids[i + 1] or ordered_parent_node_ids[i + 1] != parent_node_id)) { // last value of list child_offset is its size. d_columns_data[original_col_ids[parent_node_id]] .child_offsets[row_offsets[parent_node_id] + 1] = ordered_row_offsets[i] + 1; diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 48b2af81fcd..4bbe91b61d2 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -222,6 +223,7 @@ std::vector ingest_raw_input(std::vector> c size_t range_size, size_t range_size_padded) { + CUDF_FUNC_RANGE(); // Iterate through the user defined sources and read the contents into the local buffer size_t total_source_size = 0; for (const auto& source : sources) { @@ -313,6 +315,7 @@ rmm::device_uvector upload_data_to_device(json_reader_options const& reade rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); size_t end_offset = h_data.size(); // Trim lines that are outside range @@ -592,6 +595,7 @@ table_with_metadata read_json(std::vector>& sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); if (reader_opts.is_enabled_experimental()) { return experimental::read_json(sources, reader_opts, stream, mr); } diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 7fb83b2a24e..7a135c1f2f2 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -379,8 +379,10 @@ rmm::device_buffer reader::impl::decompress_stripe_data( device_span> inflate_out_view{inflate_out.data(), num_compressed_blocks}; switch (decompressor.compression()) { case compression_type::ZLIB: - // See https://github.com/rapidsai/cudf/issues/11812 - if (false) { + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) { + 
gpuinflate( + inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); + } else { nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE, inflate_in_view, inflate_out_view, @@ -388,13 +390,12 @@ rmm::device_buffer reader::impl::decompress_stripe_data( max_uncomp_block_size, total_decomp_size, stream); - } else { - gpuinflate( - inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); } break; case compression_type::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::SNAPPY)) { + gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); + } else { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, inflate_in_view, inflate_out_view, @@ -402,11 +403,13 @@ rmm::device_buffer reader::impl::decompress_stripe_data( max_uncomp_block_size, total_decomp_size, stream); - } else { - gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); } break; case compression_type::ZSTD: + if (auto const reason = nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD); + reason) { + CUDF_FAIL("Decompression error: " + reason.value()); + } nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, inflate_in_view, inflate_out_view, @@ -522,8 +525,8 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks parent_mask_len, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); auto merged_mask = static_cast(merged_null_mask.data()); uint32_t* dst_idx_ptr = dst_idx.data(); - // Copy child valid bits from child column to valid indexes, this will merge both child and - // parent null masks + // Copy child valid bits from child column to valid indexes, this will merge both child + // and parent null masks thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + dst_idx.size(), diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 
109030ef160..013761343d3 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1332,11 +1332,11 @@ void CompressOrcDataStreams(uint8_t* compressed_data, if (compression == SNAPPY) { try { - if (nvcomp::is_compression_enabled(nvcomp::compression_type::SNAPPY)) { + if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { + gpu_snap(comp_in, comp_out, comp_res, stream); + } else { nvcomp::batched_compress( nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); - } else { - gpu_snap(comp_in, comp_out, comp_res, stream); } } catch (...) { // There was an error in compressing so set an error status for each block @@ -1348,12 +1348,18 @@ void CompressOrcDataStreams(uint8_t* compressed_data, // Since SNAPPY is the default compression (may not be explicitly requested), fall back to // writing without compression } - } else if (compression == ZLIB and - nvcomp::is_compression_enabled(nvcomp::compression_type::DEFLATE)) { + } else if (compression == ZLIB) { + if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE); + reason) { + CUDF_FAIL("Compression error: " + reason.value()); + } nvcomp::batched_compress( nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_res, stream); - } else if (compression == ZSTD and - nvcomp::is_compression_enabled(nvcomp::compression_type::ZSTD)) { + } else if (compression == ZSTD) { + if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); + reason) { + CUDF_FAIL("Compression error: " + reason.value()); + } nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); } else if (compression != NONE) { CUDF_FAIL("Unsupported compression type"); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index a5e9e9da4cb..c0ae58a64d9 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -118,9 +118,9 @@ constexpr size_t 
compression_block_size(orc::CompressionKind compression) if (compression == orc::CompressionKind::NONE) { return 0; } auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_enabled(ncomp_type) - ? nvcomp::compress_max_allowed_chunk_size(ncomp_type) - : std::nullopt; + auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) + ? std::nullopt + : nvcomp::compress_max_allowed_chunk_size(ncomp_type); constexpr size_t max_block_size = 256 * 1024; return std::min(nvcomp_limit.value_or(max_block_size), max_block_size); @@ -537,7 +537,7 @@ constexpr size_t RLE_stream_size(TypeKind kind, size_t count) auto uncomp_block_alignment(CompressionKind compression_kind) { if (compression_kind == NONE or - not nvcomp::is_compression_enabled(to_nvcomp_compression_type(compression_kind))) { + nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { return 1u; } @@ -547,7 +547,7 @@ auto uncomp_block_alignment(CompressionKind compression_kind) auto comp_block_alignment(CompressionKind compression_kind) { if (compression_kind == NONE or - not nvcomp::is_compression_enabled(to_nvcomp_compression_type(compression_kind))) { + nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { return 1u; } @@ -2161,7 +2161,8 @@ void writer::impl::write(table_view const& table) auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); - auto const uncomp_block_align = uncomp_block_alignment(compression_kind_); + auto const uncompressed_block_align = uncomp_block_alignment(compression_kind_); + auto const compressed_block_align = comp_block_alignment(compression_kind_); auto streams = create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes)); auto enc_data = encode_columns(orc_table, @@ -2169,7 +2170,7 @@ void writer::impl::write(table_view const& table) std::move(dec_chunk_sizes), segmentation, streams, - uncomp_block_align, + 
uncompressed_block_align, stream); // Assemble individual disparate column chunks into contiguous data streams @@ -2187,9 +2188,9 @@ void writer::impl::write(table_view const& table) auto const max_compressed_block_size = max_compression_output_size(compression_kind_, compression_blocksize_); auto const padded_max_compressed_block_size = - util::round_up_unsafe(max_compressed_block_size, uncomp_block_align); + util::round_up_unsafe(max_compressed_block_size, compressed_block_align); auto const padded_block_header_size = - util::round_up_unsafe(block_header_size, uncomp_block_align); + util::round_up_unsafe(block_header_size, compressed_block_align); auto stream_output = [&]() { size_t max_stream_size = 0; @@ -2238,7 +2239,7 @@ void writer::impl::write(table_view const& table) compression_kind_, compression_blocksize_, max_compressed_block_size, - comp_block_alignment(compression_kind_), + compressed_block_align, strm_descs, enc_data.streams, comp_results, diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 671e34ac73d..999cad76d5d 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include "parquet_gpu.cuh" #include #include diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 8a07ee419b4..74e98de4100 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -13,7 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "parquet_gpu.hpp" + +#include "parquet_gpu.cuh" #include diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh new file mode 100644 index 00000000000..793573b465e --- /dev/null +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "parquet_gpu.hpp" + +#include +#include + +#include + +namespace cudf::io::parquet::gpu { + +auto constexpr KEY_SENTINEL = size_type{-1}; +auto constexpr VALUE_SENTINEL = size_type{-1}; + +using map_type = cuco::static_map; + +/** + * @brief The alias of `map_type::pair_atomic_type` class. + * + * Declare this struct by trivial subclassing instead of type aliasing so we can have forward + * declaration of this struct somewhere else. + */ +struct slot_type : public map_type::pair_atomic_type { +}; + +/** + * @brief Return the byte length of parquet dtypes that are physically represented by INT32 + */ +inline uint32_t __device__ int32_logical_len(type_id id) +{ + switch (id) { + case cudf::type_id::INT8: [[fallthrough]]; + case cudf::type_id::UINT8: return 1; + case cudf::type_id::INT16: [[fallthrough]]; + case cudf::type_id::UINT16: return 2; + case cudf::type_id::DURATION_SECONDS: [[fallthrough]]; + case cudf::type_id::DURATION_MILLISECONDS: return 8; + default: return 4; + } +} + +/** + * @brief Translate the row index of a parent column_device_view into the index of the first value + * in the leaf child. + * Only works in the context of parquet writer where struct columns are previously modified s.t. + * they only have one immediate child. 
+ */ +inline size_type __device__ row_to_value_idx(size_type idx, + parquet_column_device_view const& parquet_col) +{ + // with a byte array, we can't go all the way down to the leaf node, but instead we want to leave + // the size at the parent level because we are writing out parent row byte arrays. + auto col = *parquet_col.parent_column; + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + if (col.type().id() == type_id::STRUCT) { + idx += col.offset(); + col = col.child(0); + } else { + auto list_col = cudf::detail::lists_column_device_view(col); + auto child = list_col.child(); + if (parquet_col.output_as_byte_array && child.type().id() == type_id::UINT8) { break; } + idx = list_col.offset_at(idx); + col = child; + } + } + return idx; +} + +} // namespace cudf::io::parquet::gpu diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index ea3678129ac..7849e05eb68 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -23,14 +23,10 @@ #include "io/utilities/column_buffer.hpp" #include "io/utilities/hostdevice_vector.hpp" -#include -#include -#include +#include #include #include -#include - #include #include #include @@ -39,9 +35,7 @@ #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet { using cudf::io::detail::string_index_pair; @@ -72,11 +66,6 @@ struct input_column_info { namespace gpu { -auto constexpr KEY_SENTINEL = size_type{-1}; -auto constexpr VALUE_SENTINEL = size_type{-1}; -using map_type = cuco::static_map; -using slot_type = map_type::pair_atomic_type; - /** * @brief Enums for the flags in the page header */ @@ -108,7 +97,8 @@ struct PageNestingInfo { int32_t max_rep_level; // set during preprocessing - int32_t size; // this page/nesting-level's size contribution to the output column + int32_t size; // this page/nesting-level's row count contribution to the output column, if fully + // decoded int32_t 
page_start_value; // absolute output start index in output column data // set during data decoding @@ -247,6 +237,17 @@ struct ColumnChunkDesc { int32_t src_col_schema; // my schema index in the file }; +/** + * @brief The struct to store raw/intermediate file data before parsing. + */ +struct file_intermediate_data { + std::vector> raw_page_data; + rmm::device_buffer decomp_page_data; + hostdevice_vector chunks{}; + hostdevice_vector pages_info{}; + hostdevice_vector page_nesting_info{}; +}; + /** * @brief Struct describing an encoder column */ @@ -293,50 +294,8 @@ struct PageFragment { constexpr unsigned int kDictHashBits = 16; constexpr size_t kDictScratchSize = (1 << kDictHashBits) * sizeof(uint32_t); -/** - * @brief Return the byte length of parquet dtypes that are physically represented by INT32 - */ -inline uint32_t __device__ int32_logical_len(type_id id) -{ - switch (id) { - case cudf::type_id::INT8: [[fallthrough]]; - case cudf::type_id::UINT8: return 1; - case cudf::type_id::INT16: [[fallthrough]]; - case cudf::type_id::UINT16: return 2; - case cudf::type_id::DURATION_SECONDS: [[fallthrough]]; - case cudf::type_id::DURATION_MILLISECONDS: return 8; - default: return 4; - } -} - -/** - * @brief Translate the row index of a parent column_device_view into the index of the first value - * in the leaf child. - * Only works in the context of parquet writer where struct columns are previously modified s.t. - * they only have one immediate child. - */ -inline size_type __device__ row_to_value_idx(size_type idx, - parquet_column_device_view const& parquet_col) -{ - // with a byte array, we can't go all the way down to the leaf node, but instead we want to leave - // the size at the parent level because we are writing out parent row byte arrays. 
- auto col = *parquet_col.parent_column; - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - if (col.type().id() == type_id::STRUCT) { - idx += col.offset(); - col = col.child(0); - } else { - auto list_col = cudf::detail::lists_column_device_view(col); - auto child = list_col.child(); - if (parquet_col.output_as_byte_array && child.type().id() == type_id::UINT8) { break; } - idx = list_col.offset_at(idx); - col = child; - } - } - return idx; -} - struct EncPage; +struct slot_type; /** * @brief Struct describing an encoder column chunk @@ -630,6 +589,4 @@ void EncodeColumnIndexes(device_span chunks, rmm::cuda_stream_view stream); } // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp new file mode 100644 index 00000000000..6be6987b7cb --- /dev/null +++ b/cpp/src/io/parquet/reader.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "reader_impl.hpp" + +namespace cudf::io::detail::parquet { + +reader::reader() = default; + +reader::reader(std::vector>&& sources, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _impl(std::make_unique(std::move(sources), options, stream, mr)) +{ +} + +reader::~reader() = default; + +table_with_metadata reader::read(parquet_reader_options const& options) +{ + // if the user has specified custom row bounds + bool const uses_custom_row_bounds = options.get_num_rows() >= 0 || options.get_skip_rows() != 0; + return _impl->read(options.get_skip_rows(), + options.get_num_rows(), + uses_custom_row_bounds, + options.get_row_groups()); +} + +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp new file mode 100644 index 00000000000..a61f63f6645 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "reader_impl.hpp" + +#include + +#include + +namespace cudf::io::detail::parquet { + +void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) +{ + auto& chunks = _file_itm_data.chunks; + auto& pages = _file_itm_data.pages_info; + auto& page_nesting = _file_itm_data.page_nesting_info; + + auto is_dict_chunk = [](const gpu::ColumnChunkDesc& chunk) { + return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; + }; + + // Count the number of string dictionary entries + // NOTE: Assumes first page in the chunk is always the dictionary page + size_t total_str_dict_indexes = 0; + for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { + if (is_dict_chunk(chunks[c])) { total_str_dict_indexes += pages[page_count].num_input_values; } + page_count += chunks[c].max_num_pages; + } + + // Build index for string dictionaries since they can't be indexed + // directly due to variable-sized elements + auto str_dict_index = cudf::detail::make_zeroed_device_uvector_async( + total_str_dict_indexes, _stream); + + size_t const sum_max_depths = std::accumulate( + chunks.begin(), chunks.end(), 0, [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { + return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); + }); + + // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector + // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the + // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` + auto chunk_nested_valids = hostdevice_vector(sum_max_depths, _stream); + auto chunk_nested_data = hostdevice_vector(sum_max_depths, _stream); + auto chunk_offsets = std::vector(); + + // Update chunks with pointers to column data. 
+ for (size_t c = 0, page_count = 0, str_ofs = 0, chunk_off = 0; c < chunks.size(); c++) { + input_column_info const& input_col = _input_columns[chunks[c].src_col_index]; + CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, + "Column/page schema index mismatch"); + + if (is_dict_chunk(chunks[c])) { + chunks[c].str_dict_index = str_dict_index.data() + str_ofs; + str_ofs += pages[page_count].num_input_values; + } + + size_t max_depth = _metadata->get_output_nesting_depth(chunks[c].src_col_schema); + chunk_offsets.push_back(chunk_off); + + // get a slice of size `nesting depth` from `chunk_nested_valids` to store an array of pointers + // to validity data + auto valids = chunk_nested_valids.host_ptr(chunk_off); + chunks[c].valid_map_base = chunk_nested_valids.device_ptr(chunk_off); + + // get a slice of size `nesting depth` from `chunk_nested_data` to store an array of pointers to + // out data + auto data = chunk_nested_data.host_ptr(chunk_off); + chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); + + chunk_off += max_depth; + + // fill in the arrays on the host. there are some important considerations to + // take into account here for nested columns. specifically, with structs + // there is sharing of output buffers between input columns. consider this schema + // + // required group field_id=1 name { + // required binary field_id=2 firstname (String); + // required binary field_id=3 middlename (String); + // required binary field_id=4 lastname (String); + // } + // + // there are 3 input columns of data here (firstname, middlename, lastname), but + // only 1 output column (name). The structure of the output column buffers looks like + // the schema itself + // + // struct (name) + // string (firstname) + // string (middlename) + // string (lastname) + // + // The struct column can contain validity information. 
the problem is, the decode + // step for the input columns will all attempt to decode this validity information + // because each one has it's own copy of the repetition/definition levels. but + // since this is all happening in parallel it would mean multiple blocks would + // be stomping all over the same memory randomly. to work around this, we set + // things up so that only 1 child of any given nesting level fills in the + // data (offsets in the case of lists) or validity information for the higher + // levels of the hierarchy that are shared. In this case, it would mean we + // would just choose firstname to be the one that decodes the validity for name. + // + // we do this by only handing out the pointers to the first child we come across. + // + auto* cols = &_output_buffers; + for (size_t idx = 0; idx < max_depth; idx++) { + auto& out_buf = (*cols)[input_col.nesting[idx]]; + cols = &out_buf.children; + + int owning_schema = out_buf.user_data & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; + if (owning_schema == 0 || owning_schema == input_col.schema_idx) { + valids[idx] = out_buf.null_mask(); + data[idx] = out_buf.data(); + out_buf.user_data |= + static_cast(input_col.schema_idx) & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; + } else { + valids[idx] = nullptr; + data[idx] = nullptr; + } + } + + // column_data_base will always point to leaf data, even for nested types. + page_count += chunks[c].max_num_pages; + } + + chunks.host_to_device(_stream); + chunk_nested_valids.host_to_device(_stream); + chunk_nested_data.host_to_device(_stream); + + if (total_str_dict_indexes > 0) { + gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); + } + + gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _stream); + pages.device_to_host(_stream); + page_nesting.device_to_host(_stream); + _stream.synchronize(); + + // for list columns, add the final offset to every offset buffer. + // TODO : make this happen in more efficiently. 
Maybe use thrust::for_each + // on each buffer. Or potentially do it in PreprocessColumnData + // Note : the reason we are doing this here instead of in the decode kernel is + // that it is difficult/impossible for a given page to know that it is writing the very + // last value that should then be followed by a terminator (because rows can span + // page boundaries). + for (size_t idx = 0; idx < _input_columns.size(); idx++) { + input_column_info const& input_col = _input_columns[idx]; + + auto* cols = &_output_buffers; + for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; + cols = &out_buf.children; + + if (out_buf.type.id() != type_id::LIST || + (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED)) { + continue; + } + CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column"); + auto& child = (*cols)[input_col.nesting[l_idx + 1]]; + + // the final offset for a list at level N is the size of it's child + int offset = child.type.id() == type_id::LIST ? 
child.size - 1 : child.size; + cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), + &offset, + sizeof(offset), + cudaMemcpyHostToDevice, + _stream.value()); + out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; + } + } + + // update null counts in the final column buffers + for (size_t idx = 0; idx < pages.size(); idx++) { + gpu::PageInfo* pi = &pages[idx]; + if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } + gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + input_column_info const& input_col = _input_columns[col->src_col_index]; + + int index = pi->nesting - page_nesting.device_ptr(); + gpu::PageNestingInfo* pni = &page_nesting[index]; + + auto* cols = &_output_buffers; + for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; + cols = &out_buf.children; + + // if I wasn't the one who wrote out the validity bits, skip it + if (chunk_nested_valids.host_ptr(chunk_offsets[pi->chunk_idx])[l_idx] == nullptr) { + continue; + } + out_buf.null_count() += pni[l_idx].null_count; + } + } + + _stream.synchronize(); +} + +reader::impl::impl(std::vector>&& sources, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _stream(stream), _mr(mr), _sources(std::move(sources)) +{ + // Open and parse the source dataset metadata + _metadata = std::make_unique(_sources); + + // Override output timestamp resolution if requested + if (options.get_timestamp_type().id() != type_id::EMPTY) { + _timestamp_type = options.get_timestamp_type(); + } + + // Strings may be returned as either string or categorical columns + _strings_to_categorical = options.is_enabled_convert_strings_to_categories(); + + // Binary columns can be read as binary or strings + _reader_column_schema = options.get_column_schema(); + + // Select only columns required by the options + std::tie(_input_columns, _output_buffers, _output_column_schemas) = + 
_metadata->select_columns(options.get_columns(), + options.is_enabled_use_pandas_metadata(), + _strings_to_categorical, + _timestamp_type.id()); +} + +void reader::impl::prepare_data(size_type skip_rows, + size_type num_rows, + bool uses_custom_row_bounds, + host_span const> row_group_indices) +{ + const auto [skip_rows_corrected, num_rows_corrected, row_groups_info] = + _metadata->select_row_groups(row_group_indices, skip_rows, num_rows); + _skip_rows = skip_rows_corrected; + _num_rows = num_rows_corrected; + + if (num_rows_corrected > 0 && row_groups_info.size() != 0 && _input_columns.size() != 0) { + load_and_decompress_data(row_groups_info, num_rows_corrected); + } +} + +table_with_metadata reader::impl::read_chunk_internal(bool uses_custom_row_bounds) +{ + auto out_metadata = table_metadata{}; + + // output cudf columns as determined by the top level schema + auto out_columns = std::vector>{}; + out_columns.reserve(_output_buffers.size()); + + if (_num_rows == 0) { return finalize_output(out_metadata, out_columns); } + + allocate_columns(_skip_rows, _num_rows, uses_custom_row_bounds); + + decode_page_data(_skip_rows, _num_rows); + + // Create the final output cudf columns + for (size_t i = 0; i < _output_buffers.size(); ++i) { + auto const metadata = _reader_column_schema.has_value() + ? 
std::make_optional((*_reader_column_schema)[i]) + : std::nullopt; + column_name_info& col_name = out_metadata.schema_info.emplace_back(""); + out_columns.emplace_back(make_column(_output_buffers[i], &col_name, metadata, _stream, _mr)); + } + + return finalize_output(out_metadata, out_columns); +} + +table_with_metadata reader::impl::finalize_output(table_metadata& out_metadata, + std::vector>& out_columns) +{ + // Create empty columns as needed (this can happen if we've ended up with no actual data to read) + for (size_t i = out_columns.size(); i < _output_buffers.size(); ++i) { + column_name_info& col_name = out_metadata.schema_info.emplace_back(""); + out_columns.emplace_back(io::detail::empty_like(_output_buffers[i], &col_name, _stream, _mr)); + } + + // Return column names (must match order of returned columns) + out_metadata.column_names.resize(_output_buffers.size()); + for (size_t i = 0; i < _output_column_schemas.size(); i++) { + auto const& schema = _metadata->get_schema(_output_column_schemas[i]); + out_metadata.column_names[i] = schema.name; + } + + // Return user metadata + out_metadata.per_file_user_data = _metadata->get_key_value_metadata(); + out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), + out_metadata.per_file_user_data[0].end()}; + + return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; +} + +table_with_metadata reader::impl::read(size_type skip_rows, + size_type num_rows, + bool uses_custom_row_bounds, + host_span const> row_group_indices) +{ + prepare_data(skip_rows, num_rows, uses_custom_row_bounds, row_group_indices); + return read_chunk_internal(uses_custom_row_bounds); +} + +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu deleted file mode 100644 index 50893ebe583..00000000000 --- a/cpp/src/io/parquet/reader_impl.cu +++ /dev/null @@ -1,1855 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file reader_impl.cu - * @brief cuDF-IO Parquet reader class implementation - */ - -#include "reader_impl.hpp" - -#include "compact_protocol_reader.hpp" - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace cudf { -namespace io { -namespace detail { -namespace parquet { -// Import functionality that's independent of legacy code -using namespace cudf::io::parquet; -using namespace cudf::io; - -namespace { - -parquet::ConvertedType logical_type_to_converted_type(parquet::LogicalType const& logical) -{ - if (logical.isset.STRING) { - return parquet::UTF8; - } else if (logical.isset.MAP) { - return parquet::MAP; - } else if (logical.isset.LIST) { - return parquet::LIST; - } else if (logical.isset.ENUM) { - return parquet::ENUM; - } else if (logical.isset.DECIMAL) { - return parquet::DECIMAL; // TODO set decimal values - } else if (logical.isset.DATE) { - return parquet::DATE; - } else if (logical.isset.TIME) { - if (logical.TIME.unit.isset.MILLIS) - return parquet::TIME_MILLIS; - else if (logical.TIME.unit.isset.MICROS) - return parquet::TIME_MICROS; - } else if (logical.isset.TIMESTAMP) { - if (logical.TIMESTAMP.unit.isset.MILLIS) - return parquet::TIMESTAMP_MILLIS; - else if (logical.TIMESTAMP.unit.isset.MICROS) - return parquet::TIMESTAMP_MICROS; - } else if (logical.isset.INTEGER) { - switch (logical.INTEGER.bitWidth) { - case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; - case 16: return logical.INTEGER.isSigned ? INT_16 : UINT_16; - case 32: return logical.INTEGER.isSigned ? INT_32 : UINT_32; - case 64: return logical.INTEGER.isSigned ? 
INT_64 : UINT_64; - default: break; - } - } else if (logical.isset.UNKNOWN) { - return parquet::NA; - } else if (logical.isset.JSON) { - return parquet::JSON; - } else if (logical.isset.BSON) { - return parquet::BSON; - } - return parquet::UNKNOWN; -} - -/** - * @brief Function that translates Parquet datatype to cuDF type enum - */ -type_id to_type_id(SchemaElement const& schema, - bool strings_to_categorical, - type_id timestamp_type_id) -{ - parquet::Type const physical = schema.type; - parquet::LogicalType const logical_type = schema.logical_type; - parquet::ConvertedType converted_type = schema.converted_type; - int32_t decimal_scale = schema.decimal_scale; - - // Logical type used for actual data interpretation; the legacy converted type - // is superceded by 'logical' type whenever available. - auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != parquet::UNKNOWN) converted_type = inferred_converted_type; - if (inferred_converted_type == parquet::DECIMAL && decimal_scale == 0) - decimal_scale = schema.logical_type.DECIMAL.scale; - - switch (converted_type) { - case parquet::UINT_8: return type_id::UINT8; - case parquet::INT_8: return type_id::INT8; - case parquet::UINT_16: return type_id::UINT16; - case parquet::INT_16: return type_id::INT16; - case parquet::UINT_32: return type_id::UINT32; - case parquet::UINT_64: return type_id::UINT64; - case parquet::DATE: return type_id::TIMESTAMP_DAYS; - case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS; - case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS; - case parquet::TIMESTAMP_MILLIS: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_MILLISECONDS; - case parquet::TIMESTAMP_MICROS: - return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id - : type_id::TIMESTAMP_MICROSECONDS; - case parquet::DECIMAL: - if (physical == parquet::INT32) { return type_id::DECIMAL32; } - if (physical == parquet::INT64) { return type_id::DECIMAL64; } - if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { - if (schema.type_length <= static_cast(sizeof(int32_t))) { - return type_id::DECIMAL32; - } - if (schema.type_length <= static_cast(sizeof(int64_t))) { - return type_id::DECIMAL64; - } - if (schema.type_length <= static_cast(sizeof(__int128_t))) { - return type_id::DECIMAL128; - } - } - CUDF_FAIL("Invalid representation of decimal type"); - break; - - // maps are just List>. - case parquet::MAP: - case parquet::LIST: return type_id::LIST; - case parquet::NA: return type_id::STRING; - // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support - default: break; - } - - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and - logical_type.TIMESTAMP.unit.isset.NANOS) { - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - } - - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and - logical_type.TIME.unit.isset.NANOS) { - return type_id::DURATION_NANOSECONDS; - } - - // is it simply a struct? - if (schema.is_struct()) { return type_id::STRUCT; } - - // Physical storage type supported by Parquet; controls the on-disk storage - // format in combination with the encoding type. - switch (physical) { - case parquet::BOOLEAN: return type_id::BOOL8; - case parquet::INT32: return type_id::INT32; - case parquet::INT64: return type_id::INT64; - case parquet::FLOAT: return type_id::FLOAT32; - case parquet::DOUBLE: return type_id::FLOAT64; - case parquet::BYTE_ARRAY: - case parquet::FIXED_LEN_BYTE_ARRAY: - // Can be mapped to INT32 (32-bit hash) or STRING - return strings_to_categorical ? type_id::INT32 : type_id::STRING; - case parquet::INT96: - return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - default: break; - } - - return type_id::EMPTY; -} - -/** - * @brief Converts cuDF type enum to column logical type - */ -data_type to_data_type(type_id t_id, SchemaElement const& schema) -{ - return t_id == type_id::DECIMAL32 || t_id == type_id::DECIMAL64 || t_id == type_id::DECIMAL128 - ? data_type{t_id, numeric::scale_type{-schema.decimal_scale}} - : data_type{t_id}; -} - -/** - * @brief Function that returns the required the number of bits to store a value - */ -template -T required_bits(uint32_t max_level) -{ - return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); -} - -/** - * @brief Converts cuDF units to Parquet units. - * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. - */ -std::tuple conversion_info(type_id column_type_id, - type_id timestamp_type_id, - parquet::Type physical, - int8_t converted, - int32_t length) -{ - int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? 
length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); - } - - int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); -} - -inline void decompress_check(device_span results, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), - results.begin(), - results.end(), - [] __device__(auto const& res) { - return res.status == compression_status::SUCCESS; - }), - "Error during decompression"); -} -} // namespace - -std::string name_from_path(const std::vector& path_in_schema) -{ - // For the case of lists, we will see a schema that looks like: - // a.list.element.list.element - // where each (list.item) pair represents a level of nesting. According to the parquet spec, - // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md - // the initial field must be named "list" and the inner element must be named "element". - // If we are dealing with a list, we want to return the topmost name of the group ("a"). - // - // For other nested schemas, like structs we just want to return the bottom-most name. For - // example a struct with the schema - // b.employee.id, the column representing "id" should simply be named "id". - // - // In short, this means : return the highest level of the schema that does not have list - // definitions underneath it. 
- // - std::string s = (path_in_schema.size() > 0) ? path_in_schema[0] : ""; - for (size_t i = 1; i < path_in_schema.size(); i++) { - // The Parquet spec requires that the outer schema field is named "list". However it also - // provides a list of backwards compatibility cases that are applicable as well. Currently - // we are only handling the formal spec. This will get cleaned up and improved when we add - // support for structs. The correct thing to do will probably be to examine the type of - // the SchemaElement itself to concretely identify the start of a nested type of any kind rather - // than trying to derive it from the path string. - if (path_in_schema[i] == "list") { - // Again, strictly speaking, the Parquet spec says the inner field should be named - // "element", but there are some backwards compatibility issues that we have seen in the - // wild. For example, Pandas calls the field "item". We will allow any name for now. - i++; - continue; - } - // otherwise, we've got a real nested column. 
update the name - s = path_in_schema[i]; - } - return s; -} - -/** - * @brief Class for parsing dataset metadata - */ -struct metadata : public FileMetaData { - explicit metadata(datasource* source) - { - constexpr auto header_len = sizeof(file_header_s); - constexpr auto ender_len = sizeof(file_ender_s); - - const auto len = source->size(); - const auto header_buffer = source->host_read(0, header_len); - const auto header = reinterpret_cast(header_buffer->data()); - const auto ender_buffer = source->host_read(len - ender_len, ender_len); - const auto ender = reinterpret_cast(ender_buffer->data()); - CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source"); - CUDF_EXPECTS(header->magic == parquet_magic && ender->magic == parquet_magic, - "Corrupted header or footer"); - CUDF_EXPECTS(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len), - "Incorrect footer length"); - - const auto buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); - CompactProtocolReader cp(buffer->data(), ender->footer_len); - CUDF_EXPECTS(cp.read(this), "Cannot parse metadata"); - CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); - } -}; - -class aggregate_reader_metadata { - std::vector per_file_metadata; - std::vector> keyval_maps; - size_type num_rows; - size_type num_row_groups; - /** - * @brief Create a metadata object from each element in the source vector - */ - auto metadatas_from_sources(std::vector> const& sources) - { - std::vector metadatas; - std::transform( - sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) { - return metadata(source.get()); - }); - return metadatas; - } - - /** - * @brief Collect the keyvalue maps from each per-file metadata object into a vector of maps. 
- */ - [[nodiscard]] auto collect_keyval_metadata() - { - std::vector> kv_maps; - std::transform(per_file_metadata.cbegin(), - per_file_metadata.cend(), - std::back_inserter(kv_maps), - [](auto const& pfm) { - std::unordered_map kv_map; - std::transform(pfm.key_value_metadata.cbegin(), - pfm.key_value_metadata.cend(), - std::inserter(kv_map, kv_map.end()), - [](auto const& kv) { - return std::pair{kv.key, kv.value}; - }); - return kv_map; - }); - - return kv_maps; - } - - /** - * @brief Sums up the number of rows of each source - */ - [[nodiscard]] size_type calc_num_rows() const - { - return std::accumulate( - per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { - return sum + pfm.num_rows; - }); - } - - /** - * @brief Sums up the number of row groups of each source - */ - [[nodiscard]] size_type calc_num_row_groups() const - { - return std::accumulate( - per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { - return sum + pfm.row_groups.size(); - }); - } - - public: - aggregate_reader_metadata(std::vector> const& sources) - : per_file_metadata(metadatas_from_sources(sources)), - keyval_maps(collect_keyval_metadata()), - num_rows(calc_num_rows()), - num_row_groups(calc_num_row_groups()) - { - // Verify that the input files have matching numbers of columns - size_type num_cols = -1; - for (auto const& pfm : per_file_metadata) { - if (pfm.row_groups.size() != 0) { - if (num_cols == -1) - num_cols = pfm.row_groups[0].columns.size(); - else - CUDF_EXPECTS(num_cols == static_cast(pfm.row_groups[0].columns.size()), - "All sources must have the same number of columns"); - } - } - // Verify that the input files have matching schemas - for (auto const& pfm : per_file_metadata) { - CUDF_EXPECTS(per_file_metadata[0].schema == pfm.schema, - "All sources must have the same schemas"); - } - } - - [[nodiscard]] auto const& get_row_group(size_type row_group_index, size_type src_idx) const - { - CUDF_EXPECTS(src_idx >= 0 && 
src_idx < static_cast(per_file_metadata.size()), - "invalid source index"); - return per_file_metadata[src_idx].row_groups[row_group_index]; - } - - [[nodiscard]] auto const& get_column_metadata(size_type row_group_index, - size_type src_idx, - int schema_idx) const - { - auto col = std::find_if( - per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), - per_file_metadata[src_idx].row_groups[row_group_index].columns.end(), - [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx ? true : false; }); - CUDF_EXPECTS(col != std::end(per_file_metadata[src_idx].row_groups[row_group_index].columns), - "Found no metadata for schema index"); - return col->meta_data; - } - - [[nodiscard]] auto get_num_rows() const { return num_rows; } - - [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; } - - [[nodiscard]] auto const& get_schema(int schema_idx) const - { - return per_file_metadata[0].schema[schema_idx]; - } - - [[nodiscard]] auto const& get_key_value_metadata() const { return keyval_maps; } - - /** - * @brief Gets the concrete nesting depth of output cudf columns - * - * @param schema_index Schema index of the input column - * - * @return comma-separated index column names in quotes - */ - [[nodiscard]] inline int get_output_nesting_depth(int schema_index) const - { - auto& pfm = per_file_metadata[0]; - int depth = 0; - - // walk upwards, skipping repeated fields - while (schema_index > 0) { - if (!pfm.schema[schema_index].is_stub()) { depth++; } - // schema of one-level encoding list doesn't contain nesting information, so we need to - // manually add an extra nesting level - if (pfm.schema[schema_index].is_one_level_list()) { depth++; } - schema_index = pfm.schema[schema_index].parent_idx; - } - return depth; - } - - /** - * @brief Extracts the pandas "index_columns" section - * - * PANDAS adds its own metadata to the key_value section when writing out the - * dataframe to a file to aid in exact reconstruction. 
The JSON-formatted - * metadata contains the index column(s) and PANDA-specific datatypes. - * - * @return comma-separated index column names in quotes - */ - [[nodiscard]] std::string get_pandas_index() const - { - // Assumes that all input files have the same metadata - // TODO: verify this assumption - auto it = keyval_maps[0].find("pandas"); - if (it != keyval_maps[0].end()) { - // Captures a list of quoted strings found inside square brackets after `"index_columns":` - // Inside quotes supports newlines, brackets, escaped quotes, etc. - // One-liner regex: - // "index_columns"\s*:\s*\[\s*((?:"(?:|(?:.*?(?![^\\]")).?)[^\\]?",?\s*)*)\] - // Documented below. - std::regex index_columns_expr{ - R"("index_columns"\s*:\s*\[\s*)" // match preamble, opening square bracket, whitespace - R"(()" // Open first capturing group - R"((?:")" // Open non-capturing group match opening quote - R"((?:|(?:.*?(?![^\\]")).?))" // match empty string or anything between quotes - R"([^\\]?")" // Match closing non-escaped quote - R"(,?\s*)" // Match optional comma and whitespace - R"()*)" // Close non-capturing group and repeat 0 or more times - R"())" // Close first capturing group - R"(\])" // Match closing square brackets - }; - std::smatch sm; - if (std::regex_search(it->second, sm, index_columns_expr)) { return sm[1].str(); } - } - return ""; - } - - /** - * @brief Extracts the column name(s) used for the row indexes in a dataframe - * - * @param names List of column names to load, where index column name(s) will be added - */ - [[nodiscard]] std::vector get_pandas_index_names() const - { - std::vector names; - auto str = get_pandas_index(); - if (str.length() != 0) { - std::regex index_name_expr{R"(\"((?:\\.|[^\"])*)\")"}; - std::smatch sm; - while (std::regex_search(str, sm, index_name_expr)) { - if (sm.size() == 2) { // 2 = whole match, first item - if (std::find(names.begin(), names.end(), sm[1].str()) == names.end()) { - std::regex esc_quote{R"(\\")"}; - 
names.emplace_back(std::regex_replace(sm[1].str(), esc_quote, R"(")")); - } - } - str = sm.suffix(); - } - } - return names; - } - - struct row_group_info { - size_type const index; - size_t const start_row; // TODO source index - size_type const source_index; - row_group_info(size_type index, size_t start_row, size_type source_index) - : index(index), start_row(start_row), source_index(source_index) - { - } - }; - - /** - * @brief Filters and reduces down to a selection of row groups - * - * @param row_groups Lists of row groups to read, one per source - * @param row_start Starting row of the selection - * @param row_count Total number of rows selected - * - * @return List of row group indexes and its starting row - */ - [[nodiscard]] auto select_row_groups(std::vector> const& row_groups, - size_type& row_start, - size_type& row_count) const - { - if (!row_groups.empty()) { - std::vector selection; - CUDF_EXPECTS(row_groups.size() == per_file_metadata.size(), - "Must specify row groups for each source"); - - row_count = 0; - for (size_t src_idx = 0; src_idx < row_groups.size(); ++src_idx) { - for (auto const& rowgroup_idx : row_groups[src_idx]) { - CUDF_EXPECTS( - rowgroup_idx >= 0 && - rowgroup_idx < static_cast(per_file_metadata[src_idx].row_groups.size()), - "Invalid rowgroup index"); - selection.emplace_back(rowgroup_idx, row_count, src_idx); - row_count += get_row_group(rowgroup_idx, src_idx).num_rows; - } - } - return selection; - } - - row_start = std::max(row_start, 0); - if (row_count < 0) { - row_count = static_cast( - std::min(get_num_rows(), std::numeric_limits::max())); - } - row_count = min(row_count, get_num_rows() - row_start); - CUDF_EXPECTS(row_count >= 0, "Invalid row count"); - CUDF_EXPECTS(row_start <= get_num_rows(), "Invalid row start"); - - std::vector selection; - size_type count = 0; - for (size_t src_idx = 0; src_idx < per_file_metadata.size(); ++src_idx) { - for (size_t rg_idx = 0; rg_idx < per_file_metadata[src_idx].row_groups.size(); 
++rg_idx) { - auto const chunk_start_row = count; - count += get_row_group(rg_idx, src_idx).num_rows; - if (count > row_start || count == 0) { - selection.emplace_back(rg_idx, chunk_start_row, src_idx); - } - if (count >= row_start + row_count) { break; } - } - } - - return selection; - } - - /** - * @brief Filters and reduces down to a selection of columns - * - * @param use_names List of paths of column names to select; `nullopt` if user did not select - * columns to read - * @param include_index Whether to always include the PANDAS index column(s) - * @param strings_to_categorical Type conversion parameter - * @param timestamp_type_id Type conversion parameter - * - * @return input column information, output column information, list of output column schema - * indices - */ - [[nodiscard]] auto select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const - { - auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) { - auto const& col_schema_idx = std::find_if( - schema_elem.children_idx.cbegin(), - schema_elem.children_idx.cend(), - [&](size_t col_schema_idx) { return get_schema(col_schema_idx).name == name; }); - - return (col_schema_idx != schema_elem.children_idx.end()) ? static_cast(*col_schema_idx) - : -1; - }; - - std::vector output_columns; - std::vector input_columns; - std::vector nesting; - - // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is - // valid if "struct1.child1" exists in this file's schema. 
If "struct1" exists but "child1" is - // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> - build_column = [&](column_name_info const* col_name_info, - int schema_idx, - std::vector& out_col_array, - bool has_list_parent) { - if (schema_idx < 0) { return false; } - auto const& schema_elem = get_schema(schema_idx); - - // if schema_elem is a stub then it does not exist in the column_name_info and column_buffer - // hierarchy. So continue on - if (schema_elem.is_stub()) { - // is this legit? - CUDF_EXPECTS(schema_elem.num_children == 1, "Unexpected number of children for stub"); - auto child_col_name_info = (col_name_info) ? &col_name_info->children[0] : nullptr; - return build_column( - child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent); - } - - // if we're at the root, this is a new output column - auto const col_type = - schema_elem.is_one_level_list() - ? type_id::LIST - : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); - auto const dtype = to_data_type(col_type, schema_elem); - - column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); - if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } - // store the index of this element if inserted in out_col_array - nesting.push_back(static_cast(out_col_array.size())); - output_col.name = schema_elem.name; - - // build each child - bool path_is_valid = false; - if (col_name_info == nullptr or col_name_info->children.empty()) { - // add all children of schema_elem. 
- // At this point, we can no longer pass a col_name_info to build_column - for (int idx = 0; idx < schema_elem.num_children; idx++) { - path_is_valid |= build_column(nullptr, - schema_elem.children_idx[idx], - output_col.children, - has_list_parent || col_type == type_id::LIST); - } - } else { - for (size_t idx = 0; idx < col_name_info->children.size(); idx++) { - path_is_valid |= - build_column(&col_name_info->children[idx], - find_schema_child(schema_elem, col_name_info->children[idx].name), - output_col.children, - has_list_parent || col_type == type_id::LIST); - } - } - - // if I have no children, we're at a leaf and I'm an input column (that is, one with actual - // data stored) so add me to the list. - if (schema_elem.num_children == 0) { - input_column_info& input_col = input_columns.emplace_back( - input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0}); - - // set up child output column for one-level encoding list - if (schema_elem.is_one_level_list()) { - // determine the element data type - auto const element_type = - to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); - auto const element_dtype = to_data_type(element_type, schema_elem); - - column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); - if (has_list_parent || col_type == type_id::LIST) { - element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; - } - // store the index of this element - nesting.push_back(static_cast(output_col.children.size())); - // TODO: not sure if we should assign a name or leave it blank - element_col.name = "element"; - - output_col.children.push_back(std::move(element_col)); - } - - std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); - - // pop off the extra nesting element. 
- if (schema_elem.is_one_level_list()) { nesting.pop_back(); } - - path_is_valid = true; // If we're able to reach leaf then path is valid - } - - if (path_is_valid) { out_col_array.push_back(std::move(output_col)); } - - nesting.pop_back(); - return path_is_valid; - }; - - std::vector output_column_schemas; - - // - // there is not necessarily a 1:1 mapping between input columns and output columns. - // For example, parquet does not explicitly store a ColumnChunkDesc for struct columns. - // The "structiness" is simply implied by the schema. For example, this schema: - // required group field_id=1 name { - // required binary field_id=2 firstname (String); - // required binary field_id=3 middlename (String); - // required binary field_id=4 lastname (String); - // } - // will only contain 3 internal columns of data (firstname, middlename, lastname). But of - // course "name" is ultimately the struct column we want to return. - // - // "firstname", "middlename" and "lastname" represent the input columns in the file that we - // process to produce the final cudf "name" column. - // - // A user can ask for a single field out of the struct e.g. firstname. 
- // In this case they'll pass a fully qualified name to the schema element like - // ["name", "firstname"] - // - auto const& root = get_schema(0); - if (not use_names.has_value()) { - for (auto const& schema_idx : root.children_idx) { - build_column(nullptr, schema_idx, output_columns, false); - output_column_schemas.push_back(schema_idx); - } - } else { - struct path_info { - std::string full_path; - int schema_idx; - }; - - // Convert schema into a vector of every possible path - std::vector all_paths; - std::function add_path = [&](std::string path_till_now, - int schema_idx) { - auto const& schema_elem = get_schema(schema_idx); - std::string curr_path = path_till_now + schema_elem.name; - all_paths.push_back({curr_path, schema_idx}); - for (auto const& child_idx : schema_elem.children_idx) { - add_path(curr_path + ".", child_idx); - } - }; - for (auto const& child_idx : get_schema(0).children_idx) { - add_path("", child_idx); - } - - // Find which of the selected paths are valid and get their schema index - std::vector valid_selected_paths; - for (auto const& selected_path : *use_names) { - auto found_path = - std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) { - return valid_path.full_path == selected_path; - }); - if (found_path != all_paths.end()) { - valid_selected_paths.push_back({selected_path, found_path->schema_idx}); - } - } - - // Now construct paths as vector of strings for further consumption - std::vector> use_names3; - std::transform(valid_selected_paths.begin(), - valid_selected_paths.end(), - std::back_inserter(use_names3), - [&](path_info const& valid_path) { - auto schema_idx = valid_path.schema_idx; - std::vector result_path; - do { - SchemaElement const& elem = get_schema(schema_idx); - result_path.push_back(elem.name); - schema_idx = elem.parent_idx; - } while (schema_idx > 0); - return std::vector(result_path.rbegin(), result_path.rend()); - }); - - std::vector selected_columns; - if (include_index) { - 
std::vector index_names = get_pandas_index_names(); - std::transform(index_names.cbegin(), - index_names.cend(), - std::back_inserter(selected_columns), - [](std::string const& name) { return column_name_info(name); }); - } - // Merge the vector use_names into a set of hierarchical column_name_info objects - /* This is because if we have columns like this: - * col1 - * / \ - * s3 f4 - * / \ - * f5 f6 - * - * there may be common paths in use_names like: - * {"col1", "s3", "f5"}, {"col1", "f4"} - * which means we want the output to contain - * col1 - * / \ - * s3 f4 - * / - * f5 - * - * rather than - * col1 col1 - * | | - * s3 f4 - * | - * f5 - */ - for (auto const& path : use_names3) { - auto array_to_find_in = &selected_columns; - for (size_t depth = 0; depth < path.size(); ++depth) { - // Check if the path exists in our selected_columns and if not, add it. - auto const& name_to_find = path[depth]; - auto found_col = std::find_if( - array_to_find_in->begin(), - array_to_find_in->end(), - [&name_to_find](column_name_info const& col) { return col.name == name_to_find; }); - if (found_col == array_to_find_in->end()) { - auto& col = array_to_find_in->emplace_back(name_to_find); - array_to_find_in = &col.children; - } else { - // Path exists. go down further. - array_to_find_in = &found_col->children; - } - } - } - for (auto& col : selected_columns) { - auto const& top_level_col_schema_idx = find_schema_child(root, col.name); - bool valid_column = build_column(&col, top_level_col_schema_idx, output_columns, false); - if (valid_column) output_column_schemas.push_back(top_level_col_schema_idx); - } - } - - return std::make_tuple( - std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); - } -}; - -/** - * @brief Generate depth remappings for repetition and definition levels. 
- * - * When dealing with columns that contain lists, we must examine incoming - * repetition and definition level pairs to determine what range of output nesting - * is indicated when adding new values. This function generates the mappings of - * the R/D levels to those start/end bounds - * - * @param remap Maps column schema index to the R/D remapping vectors for that column - * @param src_col_schema The column schema to generate the new mapping for - * @param md File metadata information - */ -void generate_depth_remappings(std::map, std::vector>>& remap, - int src_col_schema, - aggregate_reader_metadata const& md) -{ - // already generated for this level - if (remap.find(src_col_schema) != remap.end()) { return; } - auto schema = md.get_schema(src_col_schema); - int max_depth = md.get_output_nesting_depth(src_col_schema); - - CUDF_EXPECTS(remap.find(src_col_schema) == remap.end(), - "Attempting to remap a schema more than once"); - auto inserted = - remap.insert(std::pair, std::vector>>{src_col_schema, {}}); - auto& depth_remap = inserted.first->second; - - std::vector& rep_depth_remap = (depth_remap.first); - rep_depth_remap.resize(schema.max_repetition_level + 1); - std::vector& def_depth_remap = (depth_remap.second); - def_depth_remap.resize(schema.max_definition_level + 1); - - // the key: - // for incoming level values R/D - // add values starting at the shallowest nesting level X has repetition level R - // until you reach the deepest nesting level Y that corresponds to the repetition level R1 - // held by the nesting level that has definition level D - // - // Example: a 3 level struct with a list at the bottom - // - // R / D Depth - // level0 0 / 1 0 - // level1 0 / 2 1 - // level2 0 / 3 2 - // list 0 / 3 3 - // element 1 / 4 4 - // - // incoming R/D : 0, 0 -> add values from depth 0 to 3 (def level 0 always maps to depth 0) - // incoming R/D : 0, 1 -> add values from depth 0 to 3 - // incoming R/D : 0, 2 -> add values from depth 0 to 3 - // incoming 
R/D : 1, 4 -> add values from depth 4 to 4 - // - // Note : the -validity- of values is simply checked by comparing the incoming D value against the - // D value of the given nesting level (incoming D >= the D for the nesting level == valid, - // otherwise NULL). The tricky part is determining what nesting levels to add values at. - // - // For schemas with no repetition level (no lists), X is always 0 and Y is always max nesting - // depth. - // - - // compute "X" from above - for (int s_idx = schema.max_repetition_level; s_idx >= 0; s_idx--) { - auto find_shallowest = [&](int r) { - int shallowest = -1; - int cur_depth = max_depth - 1; - int schema_idx = src_col_schema; - while (schema_idx > 0) { - auto cur_schema = md.get_schema(schema_idx); - if (cur_schema.max_repetition_level == r) { - // if this is a repeated field, map it one level deeper - shallowest = cur_schema.is_stub() ? cur_depth + 1 : cur_depth; - } - // if it's one-level encoding list - else if (cur_schema.is_one_level_list()) { - shallowest = cur_depth - 1; - } - if (!cur_schema.is_stub()) { cur_depth--; } - schema_idx = cur_schema.parent_idx; - } - return shallowest; - }; - rep_depth_remap[s_idx] = find_shallowest(s_idx); - } - - // compute "Y" from above - for (int s_idx = schema.max_definition_level; s_idx >= 0; s_idx--) { - auto find_deepest = [&](int d) { - SchemaElement prev_schema; - int schema_idx = src_col_schema; - int r1 = 0; - while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); - if (cur_schema.max_definition_level == d) { - // if this is a repeated field, map it one level deeper - r1 = cur_schema.is_stub() ? prev_schema.max_repetition_level - : cur_schema.max_repetition_level; - break; - } - prev_schema = cur_schema; - schema_idx = cur_schema.parent_idx; - } - - // we now know R1 from above. 
return the deepest nesting level that has the - // same repetition level - schema_idx = src_col_schema; - int depth = max_depth - 1; - while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); - if (cur_schema.max_repetition_level == r1) { - // if this is a repeated field, map it one level deeper - depth = cur_schema.is_stub() ? depth + 1 : depth; - break; - } - if (!cur_schema.is_stub()) { depth--; } - prev_schema = cur_schema; - schema_idx = cur_schema.parent_idx; - } - return depth; - }; - def_depth_remap[s_idx] = find_deepest(s_idx); - } -} - -/** - * @copydoc cudf::io::detail::parquet::read_column_chunks - */ -std::future reader::impl::read_column_chunks( - std::vector>& page_data, - hostdevice_vector& chunks, // TODO const? - size_t begin_chunk, - size_t end_chunk, - const std::vector& column_chunk_offsets, - std::vector const& chunk_source_map) -{ - // Transfer chunk data, coalescing adjacent chunks - std::vector> read_tasks; - for (size_t chunk = begin_chunk; chunk < end_chunk;) { - const size_t io_offset = column_chunk_offsets[chunk]; - size_t io_size = chunks[chunk].compressed_size; - size_t next_chunk = chunk + 1; - const bool is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); - while (next_chunk < end_chunk) { - const size_t next_offset = column_chunk_offsets[next_chunk]; - const bool is_next_compressed = - (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); - if (next_offset != io_offset + io_size || is_next_compressed != is_compressed) { - // Can't merge if not contiguous or mixing compressed and uncompressed - // Not coalescing uncompressed with compressed chunks is so that compressed buffers can be - // freed earlier (immediately after decompression stage) to limit peak memory requirements - break; - } - io_size += chunks[next_chunk].compressed_size; - next_chunk++; - } - if (io_size != 0) { - auto& source = _sources[chunk_source_map[chunk]]; - if 
(source->is_device_read_preferred(io_size)) { - auto buffer = rmm::device_buffer(io_size, _stream); - auto fut_read_size = source->device_read_async( - io_offset, io_size, static_cast(buffer.data()), _stream); - read_tasks.emplace_back(std::move(fut_read_size)); - page_data[chunk] = datasource::buffer::create(std::move(buffer)); - } else { - auto const buffer = source->host_read(io_offset, io_size); - page_data[chunk] = - datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), _stream)); - } - auto d_compdata = page_data[chunk]->data(); - do { - chunks[chunk].compressed_data = d_compdata; - d_compdata += chunks[chunk].compressed_size; - } while (++chunk != next_chunk); - } else { - chunk = next_chunk; - } - } - auto sync_fn = [](decltype(read_tasks) read_tasks) { - for (auto& task : read_tasks) { - task.wait(); - } - }; - return std::async(std::launch::deferred, sync_fn, std::move(read_tasks)); -} - -/** - * @copydoc cudf::io::detail::parquet::count_page_headers - */ -size_t reader::impl::count_page_headers(hostdevice_vector& chunks) -{ - size_t total_pages = 0; - - chunks.host_to_device(_stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), _stream); - chunks.device_to_host(_stream, true); - - for (size_t c = 0; c < chunks.size(); c++) { - total_pages += chunks[c].num_data_pages + chunks[c].num_dict_pages; - } - - return total_pages; -} - -/** - * @copydoc cudf::io::detail::parquet::decode_page_headers - */ -void reader::impl::decode_page_headers(hostdevice_vector& chunks, - hostdevice_vector& pages) -{ - // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), - // please update preprocess_nested_columns to reflect this. 
- for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - chunks[c].max_num_pages = chunks[c].num_data_pages + chunks[c].num_dict_pages; - chunks[c].page_info = pages.device_ptr(page_count); - page_count += chunks[c].max_num_pages; - } - - chunks.host_to_device(_stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), _stream); - pages.device_to_host(_stream, true); -} - -/** - * @copydoc cudf::io::detail::parquet::decompress_page_data - */ -rmm::device_buffer reader::impl::decompress_page_data( - hostdevice_vector& chunks, hostdevice_vector& pages) -{ - auto for_each_codec_page = [&](parquet::Compression codec, const std::function& f) { - for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - const auto page_stride = chunks[c].max_num_pages; - if (chunks[c].codec == codec) { - for (int k = 0; k < page_stride; k++) { - f(page_count + k); - } - } - page_count += page_stride; - } - }; - - // Brotli scratch memory for decompressing - rmm::device_buffer debrotli_scratch; - - // Count the exact number of compressed pages - size_t num_comp_pages = 0; - size_t total_decomp_size = 0; - - struct codec_stats { - parquet::Compression compression_type = UNCOMPRESSED; - size_t num_pages = 0; - int32_t max_decompressed_size = 0; - size_t total_decomp_size = 0; - }; - - std::array codecs{codec_stats{parquet::GZIP}, - codec_stats{parquet::SNAPPY}, - codec_stats{parquet::BROTLI}, - codec_stats{parquet::ZSTD}}; - - auto is_codec_supported = [&codecs](int8_t codec) { - if (codec == parquet::UNCOMPRESSED) return true; - return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { - return codec == cstats.compression_type; - }) != codecs.end(); - }; - CUDF_EXPECTS(std::all_of(chunks.begin(), - chunks.end(), - [&is_codec_supported](auto const& chunk) { - return is_codec_supported(chunk.codec); - }), - "Unsupported compression type"); - - for (auto& codec : codecs) { - for_each_codec_page(codec.compression_type, [&](size_t page) { - auto 
page_uncomp_size = pages[page].uncompressed_page_size; - total_decomp_size += page_uncomp_size; - codec.total_decomp_size += page_uncomp_size; - codec.max_decompressed_size = std::max(codec.max_decompressed_size, page_uncomp_size); - codec.num_pages++; - num_comp_pages++; - }); - if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { - debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), _stream); - } - } - - // Dispatch batches of pages to decompress for each codec - rmm::device_buffer decomp_pages(total_decomp_size, _stream); - - std::vector> comp_in; - comp_in.reserve(num_comp_pages); - std::vector> comp_out; - comp_out.reserve(num_comp_pages); - - // vectors to save v2 def and rep level data, if any - std::vector> copy_in; - copy_in.reserve(num_comp_pages); - std::vector> copy_out; - copy_out.reserve(num_comp_pages); - - rmm::device_uvector comp_res(num_comp_pages, _stream); - thrust::fill(rmm::exec_policy(_stream), - comp_res.begin(), - comp_res.end(), - compression_result{0, compression_status::FAILURE}); - - size_t decomp_offset = 0; - int32_t start_pos = 0; - for (const auto& codec : codecs) { - if (codec.num_pages == 0) { continue; } - - for_each_codec_page(codec.compression_type, [&](size_t page_idx) { - auto const dst_base = static_cast(decomp_pages.data()) + decomp_offset; - auto& page = pages[page_idx]; - // offset will only be non-zero for V2 pages - auto const offset = page.def_lvl_bytes + page.rep_lvl_bytes; - // for V2 need to copy def and rep level info into place, and then offset the - // input and output buffers. otherwise we'd have to keep both the compressed - // and decompressed data. 
- if (offset != 0) { - copy_in.emplace_back(page.page_data, offset); - copy_out.emplace_back(dst_base, offset); - } - comp_in.emplace_back(page.page_data + offset, - static_cast(page.compressed_page_size - offset)); - comp_out.emplace_back(dst_base + offset, - static_cast(page.uncompressed_page_size - offset)); - page.page_data = dst_base; - decomp_offset += page.uncompressed_page_size; - }); - - host_span const> comp_in_view{comp_in.data() + start_pos, - codec.num_pages}; - auto const d_comp_in = cudf::detail::make_device_uvector_async(comp_in_view, _stream); - host_span const> comp_out_view(comp_out.data() + start_pos, - codec.num_pages); - auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, _stream); - device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); - - switch (codec.compression_type) { - case parquet::GZIP: - gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, _stream); - break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { - nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - d_comp_in, - d_comp_out, - d_comp_res_view, - codec.max_decompressed_size, - codec.total_decomp_size, - _stream); - } else { - gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, _stream); - } - break; - case parquet::ZSTD: - nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - d_comp_in, - d_comp_out, - d_comp_res_view, - codec.max_decompressed_size, - codec.total_decomp_size, - _stream); - break; - case parquet::BROTLI: - gpu_debrotli(d_comp_in, - d_comp_out, - d_comp_res_view, - debrotli_scratch.data(), - debrotli_scratch.size(), - _stream); - break; - default: CUDF_FAIL("Unexpected decompression dispatch"); break; - } - start_pos += codec.num_pages; - } - - decompress_check(comp_res, _stream); - - // now copy the uncompressed V2 def and rep level data - if (not copy_in.empty()) { - auto const d_copy_in = cudf::detail::make_device_uvector_async(copy_in, _stream); 
- auto const d_copy_out = cudf::detail::make_device_uvector_async(copy_out, _stream); - - gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, _stream); - _stream.synchronize(); - } - - // Update the page information in device memory with the updated value of - // page_data; it now points to the uncompressed data buffer - pages.host_to_device(_stream); - - return decomp_pages; -} - -/** - * @copydoc cudf::io::detail::parquet::allocate_nesting_info - */ -void reader::impl::allocate_nesting_info(hostdevice_vector const& chunks, - hostdevice_vector& pages, - hostdevice_vector& page_nesting_info) -{ - // compute total # of page_nesting infos needed and allocate space. doing this in one - // buffer to keep it to a single gpu allocation - size_t const total_page_nesting_infos = std::accumulate( - chunks.host_ptr(), chunks.host_ptr() + chunks.size(), 0, [&](int total, auto& chunk) { - // the schema of the input column - auto const& schema = _metadata->get_schema(chunk.src_col_schema); - auto const per_page_nesting_info_size = max( - schema.max_definition_level + 1, _metadata->get_output_nesting_depth(chunk.src_col_schema)); - return total + (per_page_nesting_info_size * chunk.num_data_pages); - }); - - page_nesting_info = hostdevice_vector{total_page_nesting_infos, _stream}; - - // retrieve from the gpu so we can update - pages.device_to_host(_stream, true); - - // update pointers in the PageInfos - int target_page_index = 0; - int src_info_index = 0; - for (size_t idx = 0; idx < chunks.size(); idx++) { - int src_col_schema = chunks[idx].src_col_schema; - auto& schema = _metadata->get_schema(src_col_schema); - auto const per_page_nesting_info_size = std::max( - schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); - - // skip my dict pages - target_page_index += chunks[idx].num_dict_pages; - for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - pages[target_page_index + p_idx].nesting = page_nesting_info.device_ptr() + 
src_info_index; - pages[target_page_index + p_idx].num_nesting_levels = per_page_nesting_info_size; - - src_info_index += per_page_nesting_info_size; - } - target_page_index += chunks[idx].num_data_pages; - } - - // copy back to the gpu - pages.host_to_device(_stream); - - // fill in - int nesting_info_index = 0; - std::map, std::vector>> depth_remapping; - for (size_t idx = 0; idx < chunks.size(); idx++) { - int src_col_schema = chunks[idx].src_col_schema; - - // schema of the input column - auto& schema = _metadata->get_schema(src_col_schema); - // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) - int max_depth = _metadata->get_output_nesting_depth(src_col_schema); - - // # of nesting infos stored per page for this column - auto const per_page_nesting_info_size = std::max(schema.max_definition_level + 1, max_depth); - - // if this column has lists, generate depth remapping - std::map, std::vector>> depth_remapping; - if (schema.max_repetition_level > 0) { - generate_depth_remappings(depth_remapping, src_col_schema, *_metadata); - } - - // fill in host-side nesting info - int schema_idx = src_col_schema; - auto cur_schema = _metadata->get_schema(schema_idx); - int cur_depth = max_depth - 1; - while (schema_idx > 0) { - // stub columns (basically the inner field of a list scheme element) are not real columns. 
- // we can ignore them for the purposes of output nesting info - if (!cur_schema.is_stub()) { - // initialize each page within the chunk - for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo* pni = - &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; - - // if we have lists, set our start and end depth remappings - if (schema.max_repetition_level > 0) { - auto remap = depth_remapping.find(src_col_schema); - CUDF_EXPECTS(remap != depth_remapping.end(), - "Could not find depth remapping for schema"); - std::vector const& rep_depth_remap = (remap->second.first); - std::vector const& def_depth_remap = (remap->second.second); - - for (size_t m = 0; m < rep_depth_remap.size(); m++) { - pni[m].start_depth = rep_depth_remap[m]; - } - for (size_t m = 0; m < def_depth_remap.size(); m++) { - pni[m].end_depth = def_depth_remap[m]; - } - } - - // values indexed by output column index - pni[cur_depth].max_def_level = cur_schema.max_definition_level; - pni[cur_depth].max_rep_level = cur_schema.max_repetition_level; - pni[cur_depth].size = 0; - } - - // move up the hierarchy - cur_depth--; - } - - // next schema - schema_idx = cur_schema.parent_idx; - cur_schema = _metadata->get_schema(schema_idx); - } - - nesting_info_index += (per_page_nesting_info_size * chunks[idx].num_data_pages); - } - - // copy nesting info to the device - page_nesting_info.host_to_device(_stream); -} - -/** - * @copydoc cudf::io::detail::parquet::preprocess_columns - */ -void reader::impl::preprocess_columns(hostdevice_vector& chunks, - hostdevice_vector& pages, - size_t min_row, - size_t total_rows, - bool uses_custom_row_bounds) -{ - // iterate over all input columns and allocate any associated output - // buffers if they are not part of a list hierarchy. mark down - // if we have any list columns that need further processing. 
- bool has_lists = false; - for (size_t idx = 0; idx < _input_columns.size(); idx++) { - auto const& input_col = _input_columns[idx]; - size_t const max_depth = input_col.nesting_depth(); - - auto* cols = &_output_columns; - for (size_t l_idx = 0; l_idx < max_depth; l_idx++) { - auto& out_buf = (*cols)[input_col.nesting[l_idx]]; - cols = &out_buf.children; - - // if this has a list parent, we will have to do further work in gpu::PreprocessColumnData - // to know how big this buffer actually is. - if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { - has_lists = true; - } - // if we haven't already processed this column because it is part of a struct hierarchy - else if (out_buf.size == 0) { - // add 1 for the offset if this is a list column - out_buf.create( - out_buf.type.id() == type_id::LIST && l_idx < max_depth ? total_rows + 1 : total_rows, - _stream, - _mr); - } - } - } - - // if we have columns containing lists, further preprocessing is necessary. - if (has_lists) { - gpu::PreprocessColumnData(pages, - chunks, - _input_columns, - _output_columns, - total_rows, - min_row, - uses_custom_row_bounds, - _stream, - _mr); - _stream.synchronize(); - } -} - -/** - * @copydoc cudf::io::detail::parquet::decode_page_data - */ -void reader::impl::decode_page_data(hostdevice_vector& chunks, - hostdevice_vector& pages, - hostdevice_vector& page_nesting, - size_t min_row, - size_t total_rows) -{ - auto is_dict_chunk = [](const gpu::ColumnChunkDesc& chunk) { - return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; - }; - - // Count the number of string dictionary entries - // NOTE: Assumes first page in the chunk is always the dictionary page - size_t total_str_dict_indexes = 0; - for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - if (is_dict_chunk(chunks[c])) { total_str_dict_indexes += pages[page_count].num_input_values; } - page_count += chunks[c].max_num_pages; - } - - // Build index for string dictionaries since they 
can't be indexed - // directly due to variable-sized elements - auto str_dict_index = cudf::detail::make_zeroed_device_uvector_async( - total_str_dict_indexes, _stream); - - // TODO (dm): hd_vec should have begin and end iterator members - size_t sum_max_depths = - std::accumulate(chunks.host_ptr(), - chunks.host_ptr(chunks.size()), - 0, - [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { - return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); - }); - - // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector - // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the - // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` - auto chunk_nested_valids = hostdevice_vector(sum_max_depths, _stream); - auto chunk_nested_data = hostdevice_vector(sum_max_depths, _stream); - auto chunk_offsets = std::vector(); - - // Update chunks with pointers to column data. 
- for (size_t c = 0, page_count = 0, str_ofs = 0, chunk_off = 0; c < chunks.size(); c++) { - input_column_info const& input_col = _input_columns[chunks[c].src_col_index]; - CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, - "Column/page schema index mismatch"); - - if (is_dict_chunk(chunks[c])) { - chunks[c].str_dict_index = str_dict_index.data() + str_ofs; - str_ofs += pages[page_count].num_input_values; - } - - size_t max_depth = _metadata->get_output_nesting_depth(chunks[c].src_col_schema); - chunk_offsets.push_back(chunk_off); - - // get a slice of size `nesting depth` from `chunk_nested_valids` to store an array of pointers - // to validity data - auto valids = chunk_nested_valids.host_ptr(chunk_off); - chunks[c].valid_map_base = chunk_nested_valids.device_ptr(chunk_off); - - // get a slice of size `nesting depth` from `chunk_nested_data` to store an array of pointers to - // out data - auto data = chunk_nested_data.host_ptr(chunk_off); - chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); - - chunk_off += max_depth; - - // fill in the arrays on the host. there are some important considerations to - // take into account here for nested columns. specifically, with structs - // there is sharing of output buffers between input columns. consider this schema - // - // required group field_id=1 name { - // required binary field_id=2 firstname (String); - // required binary field_id=3 middlename (String); - // required binary field_id=4 lastname (String); - // } - // - // there are 3 input columns of data here (firstname, middlename, lastname), but - // only 1 output column (name). The structure of the output column buffers looks like - // the schema itself - // - // struct (name) - // string (firstname) - // string (middlename) - // string (lastname) - // - // The struct column can contain validity information. 
the problem is, the decode - // step for the input columns will all attempt to decode this validity information - // because each one has it's own copy of the repetition/definition levels. but - // since this is all happening in parallel it would mean multiple blocks would - // be stomping all over the same memory randomly. to work around this, we set - // things up so that only 1 child of any given nesting level fills in the - // data (offsets in the case of lists) or validity information for the higher - // levels of the hierarchy that are shared. In this case, it would mean we - // would just choose firstname to be the one that decodes the validity for name. - // - // we do this by only handing out the pointers to the first child we come across. - // - auto* cols = &_output_columns; - for (size_t idx = 0; idx < max_depth; idx++) { - auto& out_buf = (*cols)[input_col.nesting[idx]]; - cols = &out_buf.children; - - int owning_schema = out_buf.user_data & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; - if (owning_schema == 0 || owning_schema == input_col.schema_idx) { - valids[idx] = out_buf.null_mask(); - data[idx] = out_buf.data(); - out_buf.user_data |= - static_cast(input_col.schema_idx) & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; - } else { - valids[idx] = nullptr; - data[idx] = nullptr; - } - } - - // column_data_base will always point to leaf data, even for nested types. - page_count += chunks[c].max_num_pages; - } - - chunks.host_to_device(_stream); - chunk_nested_valids.host_to_device(_stream); - chunk_nested_data.host_to_device(_stream); - - if (total_str_dict_indexes > 0) { - gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); - } - - gpu::DecodePageData(pages, chunks, total_rows, min_row, _stream); - pages.device_to_host(_stream); - page_nesting.device_to_host(_stream); - _stream.synchronize(); - - // for list columns, add the final offset to every offset buffer. - // TODO : make this happen in more efficiently. 
Maybe use thrust::for_each - // on each buffer. Or potentially do it in PreprocessColumnData - // Note : the reason we are doing this here instead of in the decode kernel is - // that it is difficult/impossible for a given page to know that it is writing the very - // last value that should then be followed by a terminator (because rows can span - // page boundaries). - for (size_t idx = 0; idx < _input_columns.size(); idx++) { - input_column_info const& input_col = _input_columns[idx]; - - auto* cols = &_output_columns; - for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto& out_buf = (*cols)[input_col.nesting[l_idx]]; - cols = &out_buf.children; - - if (out_buf.type.id() != type_id::LIST || - (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED)) { - continue; - } - CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column"); - auto& child = (*cols)[input_col.nesting[l_idx + 1]]; - - // the final offset for a list at level N is the size of it's child - int offset = child.type.id() == type_id::LIST ? 
child.size - 1 : child.size; - cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(offset), - cudaMemcpyHostToDevice, - _stream.value()); - out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; - } - } - - // update null counts in the final column buffers - for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo* pi = &pages[idx]; - if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; - input_column_info const& input_col = _input_columns[col->src_col_index]; - - int index = pi->nesting - page_nesting.device_ptr(); - gpu::PageNestingInfo* pni = &page_nesting[index]; - - auto* cols = &_output_columns; - for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto& out_buf = (*cols)[input_col.nesting[l_idx]]; - cols = &out_buf.children; - - // if I wasn't the one who wrote out the validity bits, skip it - if (chunk_nested_valids.host_ptr(chunk_offsets[pi->chunk_idx])[l_idx] == nullptr) { - continue; - } - out_buf.null_count() += pni[l_idx].null_count; - } - } - - _stream.synchronize(); -} - -reader::impl::impl(std::vector>&& sources, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _stream(stream), _mr(mr), _sources(std::move(sources)) -{ - // Open and parse the source dataset metadata - _metadata = std::make_unique(_sources); - - // Override output timestamp resolution if requested - if (options.get_timestamp_type().id() != type_id::EMPTY) { - _timestamp_type = options.get_timestamp_type(); - } - - // Strings may be returned as either string or categorical columns - _strings_to_categorical = options.is_enabled_convert_strings_to_categories(); - - // Binary columns can be read as binary or strings - _reader_column_schema = options.get_column_schema(); - - // Select only columns required by the options - std::tie(_input_columns, _output_columns, _output_column_schemas) = - 
_metadata->select_columns(options.get_columns(), - options.is_enabled_use_pandas_metadata(), - _strings_to_categorical, - _timestamp_type.id()); -} - -table_with_metadata reader::impl::read(size_type skip_rows, - size_type num_rows, - bool uses_custom_row_bounds, - std::vector> const& row_group_list) -{ - // Select only row groups required - const auto selected_row_groups = - _metadata->select_row_groups(row_group_list, skip_rows, num_rows); - - table_metadata out_metadata; - - // output cudf columns as determined by the top level schema - std::vector> out_columns; - out_columns.reserve(_output_columns.size()); - - if (selected_row_groups.size() != 0 && _input_columns.size() != 0) { - // Descriptors for all the chunks that make up the selected columns - const auto num_input_columns = _input_columns.size(); - const auto num_chunks = selected_row_groups.size() * num_input_columns; - hostdevice_vector chunks(0, num_chunks, _stream); - - // Association between each column chunk and its source - std::vector chunk_source_map(num_chunks); - - // Tracker for eventually deallocating compressed and uncompressed data - std::vector> page_data(num_chunks); - - // Keep track of column chunk file offsets - std::vector column_chunk_offsets(num_chunks); - - // Initialize column chunk information - size_t total_decompressed_size = 0; - auto remaining_rows = num_rows; - std::vector> read_rowgroup_tasks; - for (const auto& rg : selected_row_groups) { - const auto& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; - auto const row_group_source = rg.source_index; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); - auto const io_chunk_idx = chunks.size(); - - // generate ColumnChunkDesc objects for everything to be decoded (all input columns) - for (size_t i = 0; i < num_input_columns; ++i) { - auto col = _input_columns[i]; - // look up metadata - auto& col_meta = 
_metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); - - auto [type_width, clock_rate, converted_type] = - conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), - _timestamp_type.id(), - schema.type, - schema.converted_type, - schema.type_length); - - column_chunk_offsets[chunks.size()] = - (col_meta.dictionary_page_offset != 0) - ? std::min(col_meta.data_page_offset, col_meta.dictionary_page_offset) - : col_meta.data_page_offset; - - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_scale, - clock_rate, - i, - col.schema_idx)); - - // Map each column chunk to its column index and its source index - chunk_source_map[chunks.size() - 1] = row_group_source; - - if (col_meta.codec != Compression::UNCOMPRESSED) { - total_decompressed_size += col_meta.total_uncompressed_size; - } - } - // Read compressed chunk data to device memory - read_rowgroup_tasks.push_back(read_column_chunks( - page_data, chunks, io_chunk_idx, chunks.size(), column_chunk_offsets, chunk_source_map)); - - remaining_rows -= row_group.num_rows; - } - for (auto& task : read_rowgroup_tasks) { - task.wait(); - } - assert(remaining_rows <= 0); - - // Process dataset chunk pages into output columns - const auto total_pages = count_page_headers(chunks); - if (total_pages > 0) { - hostdevice_vector pages(total_pages, total_pages, _stream); - rmm::device_buffer decomp_page_data; - - // decoding of column/page information - decode_page_headers(chunks, pages); - if (total_decompressed_size > 0) { - 
decomp_page_data = decompress_page_data(chunks, pages); - // Free compressed data - for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { page_data[c].reset(); } - } - } - - // build output column info - // walk the schema, building out_buffers that mirror what our final cudf columns will look - // like. important : there is not necessarily a 1:1 mapping between input columns and output - // columns. For example, parquet does not explicitly store a ColumnChunkDesc for struct - // columns. The "structiness" is simply implied by the schema. For example, this schema: - // required group field_id=1 name { - // required binary field_id=2 firstname (String); - // required binary field_id=3 middlename (String); - // required binary field_id=4 lastname (String); - // } - // will only contain 3 columns of data (firstname, middlename, lastname). But of course - // "name" is a struct column that we want to return, so we have to make sure that we - // create it ourselves. - // std::vector output_info = build_output_column_info(); - - // nesting information (sizes, etc) stored -per page- - // note : even for flat schemas, we allocate 1 level of "nesting" info - hostdevice_vector page_nesting_info; - allocate_nesting_info(chunks, pages, page_nesting_info); - - // - compute column sizes and allocate output buffers. - // important: - // for nested schemas, we have to do some further preprocessing to determine: - // - real column output sizes per level of nesting (in a flat schema, there's only 1 level - // of - // nesting and it's size is the row count) - // - // - for nested schemas, output buffer offset values per-page, per nesting-level for the - // purposes of decoding. 
- preprocess_columns(chunks, pages, skip_rows, num_rows, uses_custom_row_bounds); - - // decoding of column data itself - decode_page_data(chunks, pages, page_nesting_info, skip_rows, num_rows); - - // create the final output cudf columns - for (size_t i = 0; i < _output_columns.size(); ++i) { - column_name_info& col_name = out_metadata.schema_info.emplace_back(""); - auto const metadata = - _reader_column_schema.has_value() - ? std::make_optional((*_reader_column_schema)[i]) - : std::nullopt; - out_columns.emplace_back( - make_column(_output_columns[i], &col_name, metadata, _stream, _mr)); - } - } - } - - // Create empty columns as needed (this can happen if we've ended up with no actual data to read) - for (size_t i = out_columns.size(); i < _output_columns.size(); ++i) { - column_name_info& col_name = out_metadata.schema_info.emplace_back(""); - out_columns.emplace_back(io::detail::empty_like(_output_columns[i], &col_name, _stream, _mr)); - } - - // Return column names (must match order of returned columns) - out_metadata.column_names.resize(_output_columns.size()); - for (size_t i = 0; i < _output_column_schemas.size(); i++) { - auto const& schema = _metadata->get_schema(_output_column_schemas[i]); - out_metadata.column_names[i] = schema.name; - } - - // Return user metadata - out_metadata.per_file_user_data = _metadata->get_key_value_metadata(); - out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), - out_metadata.per_file_user_data[0].end()}; - - return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; -} - -// Forward to implementation -reader::reader(std::vector>&& sources, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(std::move(sources), options, stream, mr)) -{ -} - -// Destructor within this translation unit -reader::~reader() = default; - -// Forward to implementation -table_with_metadata reader::read(parquet_reader_options const& options) -{ - // if the user has specified custom row bounds - bool const uses_custom_row_bounds = options.get_num_rows() >= 0 || options.get_skip_rows() != 0; - return _impl->read(options.get_skip_rows(), - options.get_num_rows(), - uses_custom_row_bounds, - options.get_row_groups()); -} - -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 6c3e05b4264..b53487c824b 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -21,32 +21,23 @@ #pragma once -#include "parquet.hpp" #include "parquet_gpu.hpp" +#include "reader_impl_helpers.hpp" #include -#include #include #include #include #include +#include #include -#include -#include +#include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { -using namespace cudf::io::parquet; -using namespace cudf::io; - -// Forward declarations -class aggregate_reader_metadata; +namespace cudf::io::detail::parquet { /** * @brief Implementation for Parquet reader @@ -71,8 +62,8 @@ class reader::impl { * * @param skip_rows Number of rows to skip from the start * @param num_rows Number of rows to read - * @param uses_custom_row_bounds Whether or not num_rows and min_rows represents user-specific - * bounds + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds * @param row_group_indices Lists of row groups to read, 
one per source * * @return The set of columns along with metadata @@ -80,111 +71,79 @@ class reader::impl { table_with_metadata read(size_type skip_rows, size_type num_rows, bool uses_custom_row_bounds, - std::vector> const& row_group_indices); + host_span const> row_group_indices); private: /** - * @brief Reads compressed page data to device memory - * - * @param page_data Buffers to hold compressed page data for each chunk - * @param chunks List of column chunk descriptors - * @param begin_chunk Index of first column chunk to read - * @param end_chunk Index after the last column chunk to read - * @param column_chunk_offsets File offset for all chunks + * @brief Perform the necessary data preprocessing for parsing file later on. * + * @param skip_rows Number of rows to skip from the start + * @param num_rows Number of rows to read, or `-1` to read all rows + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds + * @param row_group_indices Lists of row groups to read (one per source), or empty if read all */ - std::future read_column_chunks(std::vector>& page_data, - hostdevice_vector& chunks, - size_t begin_chunk, - size_t end_chunk, - const std::vector& column_chunk_offsets, - std::vector const& chunk_source_map); + void prepare_data(size_type skip_rows, + size_type num_rows, + bool uses_custom_row_bounds, + host_span const> row_group_indices); /** - * @brief Returns the number of total pages from the given column chunks - * - * @param chunks List of column chunk descriptors - * - * @return The total number of pages + * @brief Load and decompress the input file(s) into memory. */ - size_t count_page_headers(hostdevice_vector& chunks); + void load_and_decompress_data(std::vector const& row_groups_info, + size_type num_rows); /** - * @brief Returns the page information from the given column chunks. + * @brief Allocate nesting information storage for all pages and set pointers to it. 
* - * @param chunks List of column chunk descriptors - * @param pages List of page information + * One large contiguous buffer of PageNestingInfo structs is allocated and + * distributed among the PageInfo structs. + * + * Note that this gets called even in the flat schema case so that we have a + * consistent place to store common information such as value counts, etc. */ - void decode_page_headers(hostdevice_vector& chunks, - hostdevice_vector& pages); + void allocate_nesting_info(); /** - * @brief Decompresses the page data, at page granularity. + * @brief Read a chunk of data and return an output table. * - * @param chunks List of column chunk descriptors - * @param pages List of page information + * This function is called internally and expects all preprocessing steps have already been done. * - * @return Device buffer to decompressed page data + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds + * @return The output table along with columns' metadata */ - rmm::device_buffer decompress_page_data(hostdevice_vector& chunks, - hostdevice_vector& pages); + table_with_metadata read_chunk_internal(bool uses_custom_row_bounds); /** - * @brief Allocate nesting information storage for all pages and set pointers - * to it. - * - * One large contiguous buffer of PageNestingInfo structs is allocated and - * distributed among the PageInfo structs. - * - * Note that this gets called even in the flat schema case so that we have a - * consistent place to store common information such as value counts, etc. + * @brief Finalize the output table by adding empty columns for the non-selected columns in + * schema. * - * @param chunks List of column chunk descriptors - * @param pages List of page information - * @param page_nesting_info The allocated nesting info structs. 
+ * @param out_metadata The output table metadata + * @param out_columns The columns for building the output table + * @return The output table along with columns' metadata */ - void allocate_nesting_info(hostdevice_vector const& chunks, - hostdevice_vector& pages, - hostdevice_vector& page_nesting_info); + table_with_metadata finalize_output(table_metadata& out_metadata, + std::vector>& out_columns); /** - * @brief Preprocess column information and allocate output buffers. - * - * There are several pieces of information we can't compute directly from row counts in - * the parquet headers when dealing with nested schemas. - * - The total sizes of all output columns at all nesting levels - * - The starting output buffer offset for each page, for each nesting level + * @brief Allocate data bufers for the output columns. * - * For flat schemas, these values are computed during header decoding (see gpuDecodePageHeaders) - * - * @param chunks All chunks to be decoded - * @param pages All pages to be decoded - * @param min_rows crop all rows below min_row - * @param total_rows Maximum number of rows to read - * @param uses_custom_row_bounds Whether or not num_rows and min_rows represents user-specific - * bounds - * a preprocess. + * @param skip_rows Crop all rows below skip_rows + * @param num_rows Maximum number of rows to read + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds */ - void preprocess_columns(hostdevice_vector& chunks, - hostdevice_vector& pages, - size_t min_row, - size_t total_rows, - bool uses_custom_row_bounds); + void allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds); /** * @brief Converts the page data and outputs to columns. 
* - * @param chunks List of column chunk descriptors - * @param pages List of page information - * @param page_nesting Page nesting array - * @param min_row Minimum number of rows from start - * @param total_rows Number of rows to output + * @param skip_rows Minimum number of rows from start + * @param num_rows Number of rows to output */ - void decode_page_data(hostdevice_vector& chunks, - hostdevice_vector& pages, - hostdevice_vector& page_nesting, - size_t min_row, - size_t total_rows); + void decode_page_data(size_t skip_rows, size_t num_rows); private: rmm::cuda_stream_view _stream; @@ -195,17 +154,21 @@ class reader::impl { // input columns to be processed std::vector _input_columns; - // output columns to be generated - std::vector _output_columns; - // _output_columns associated schema indices + + // Buffers for generating output columns + std::vector _output_buffers; + + // _output_buffers associated schema indices std::vector _output_column_schemas; bool _strings_to_categorical = false; std::optional> _reader_column_schema; data_type _timestamp_type{type_id::EMPTY}; + + cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; + + size_type _skip_rows{0}; + size_type _num_rows{0}; }; -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp new file mode 100644 index 00000000000..7090df2cae0 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -0,0 +1,629 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl_helpers.hpp" + +#include +#include + +namespace cudf::io::detail::parquet { + +namespace { + +ConvertedType logical_type_to_converted_type(LogicalType const& logical) +{ + if (logical.isset.STRING) { + return parquet::UTF8; + } else if (logical.isset.MAP) { + return parquet::MAP; + } else if (logical.isset.LIST) { + return parquet::LIST; + } else if (logical.isset.ENUM) { + return parquet::ENUM; + } else if (logical.isset.DECIMAL) { + return parquet::DECIMAL; // TODO set decimal values + } else if (logical.isset.DATE) { + return parquet::DATE; + } else if (logical.isset.TIME) { + if (logical.TIME.unit.isset.MILLIS) + return parquet::TIME_MILLIS; + else if (logical.TIME.unit.isset.MICROS) + return parquet::TIME_MICROS; + } else if (logical.isset.TIMESTAMP) { + if (logical.TIMESTAMP.unit.isset.MILLIS) + return parquet::TIMESTAMP_MILLIS; + else if (logical.TIMESTAMP.unit.isset.MICROS) + return parquet::TIMESTAMP_MICROS; + } else if (logical.isset.INTEGER) { + switch (logical.INTEGER.bitWidth) { + case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; + case 16: return logical.INTEGER.isSigned ? INT_16 : UINT_16; + case 32: return logical.INTEGER.isSigned ? INT_32 : UINT_32; + case 64: return logical.INTEGER.isSigned ? 
INT_64 : UINT_64; + default: break; + } + } else if (logical.isset.UNKNOWN) { + return parquet::NA; + } else if (logical.isset.JSON) { + return parquet::JSON; + } else if (logical.isset.BSON) { + return parquet::BSON; + } + return parquet::UNKNOWN; +} + +} // namespace + +/** + * @brief Function that translates Parquet datatype to cuDF type enum + */ +type_id to_type_id(SchemaElement const& schema, + bool strings_to_categorical, + type_id timestamp_type_id) +{ + parquet::Type const physical = schema.type; + parquet::LogicalType const logical_type = schema.logical_type; + parquet::ConvertedType converted_type = schema.converted_type; + int32_t decimal_scale = schema.decimal_scale; + + // Logical type used for actual data interpretation; the legacy converted type + // is superceded by 'logical' type whenever available. + auto const inferred_converted_type = logical_type_to_converted_type(logical_type); + if (inferred_converted_type != parquet::UNKNOWN) converted_type = inferred_converted_type; + if (inferred_converted_type == parquet::DECIMAL && decimal_scale == 0) + decimal_scale = schema.logical_type.DECIMAL.scale; + + switch (converted_type) { + case parquet::UINT_8: return type_id::UINT8; + case parquet::INT_8: return type_id::INT8; + case parquet::UINT_16: return type_id::UINT16; + case parquet::INT_16: return type_id::INT16; + case parquet::UINT_32: return type_id::UINT32; + case parquet::UINT_64: return type_id::UINT64; + case parquet::DATE: return type_id::TIMESTAMP_DAYS; + case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS; + case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS; + case parquet::TIMESTAMP_MILLIS: + return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id + : type_id::TIMESTAMP_MILLISECONDS; + case parquet::TIMESTAMP_MICROS: + return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id + : type_id::TIMESTAMP_MICROSECONDS; + case parquet::DECIMAL: + if (physical == parquet::INT32) { return type_id::DECIMAL32; } + if (physical == parquet::INT64) { return type_id::DECIMAL64; } + if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { + if (schema.type_length <= static_cast(sizeof(int32_t))) { + return type_id::DECIMAL32; + } + if (schema.type_length <= static_cast(sizeof(int64_t))) { + return type_id::DECIMAL64; + } + if (schema.type_length <= static_cast(sizeof(__int128_t))) { + return type_id::DECIMAL128; + } + } + CUDF_FAIL("Invalid representation of decimal type"); + break; + + // maps are just List>. + case parquet::MAP: + case parquet::LIST: return type_id::LIST; + case parquet::NA: return type_id::STRING; + // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support + default: break; + } + + if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + logical_type.TIMESTAMP.unit.isset.NANOS) { + return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id + : type_id::TIMESTAMP_NANOSECONDS; + } + + if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + logical_type.TIME.unit.isset.NANOS) { + return type_id::DURATION_NANOSECONDS; + } + + // is it simply a struct? + if (schema.is_struct()) { return type_id::STRUCT; } + + // Physical storage type supported by Parquet; controls the on-disk storage + // format in combination with the encoding type. + switch (physical) { + case parquet::BOOLEAN: return type_id::BOOL8; + case parquet::INT32: return type_id::INT32; + case parquet::INT64: return type_id::INT64; + case parquet::FLOAT: return type_id::FLOAT32; + case parquet::DOUBLE: return type_id::FLOAT64; + case parquet::BYTE_ARRAY: + case parquet::FIXED_LEN_BYTE_ARRAY: + // Can be mapped to INT32 (32-bit hash) or STRING + return strings_to_categorical ? type_id::INT32 : type_id::STRING; + case parquet::INT96: + return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id + : type_id::TIMESTAMP_NANOSECONDS; + default: break; + } + + return type_id::EMPTY; +} + +metadata::metadata(datasource* source) +{ + constexpr auto header_len = sizeof(file_header_s); + constexpr auto ender_len = sizeof(file_ender_s); + + const auto len = source->size(); + const auto header_buffer = source->host_read(0, header_len); + const auto header = reinterpret_cast(header_buffer->data()); + const auto ender_buffer = source->host_read(len - ender_len, ender_len); + const auto ender = reinterpret_cast(ender_buffer->data()); + CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source"); + CUDF_EXPECTS(header->magic == parquet_magic && ender->magic == parquet_magic, + "Corrupted header or footer"); + CUDF_EXPECTS(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len), + "Incorrect footer length"); + + const auto buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); + CompactProtocolReader cp(buffer->data(), ender->footer_len); + CUDF_EXPECTS(cp.read(this), "Cannot parse metadata"); + CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); +} + +std::vector aggregate_reader_metadata::metadatas_from_sources( + std::vector> const& sources) +{ + std::vector metadatas; + std::transform( + sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) { + return metadata(source.get()); + }); + return metadatas; +} + +std::vector> +aggregate_reader_metadata::collect_keyval_metadata() const +{ + std::vector> kv_maps; + std::transform(per_file_metadata.cbegin(), + per_file_metadata.cend(), + std::back_inserter(kv_maps), + [](auto const& pfm) { + std::unordered_map kv_map; + std::transform(pfm.key_value_metadata.cbegin(), + pfm.key_value_metadata.cend(), + std::inserter(kv_map, kv_map.end()), + [](auto const& kv) { + return std::pair{kv.key, kv.value}; + }); + return kv_map; + }); + + return kv_maps; +} + +size_type aggregate_reader_metadata::calc_num_rows() 
const +{ + return std::accumulate( + per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { + return sum + pfm.num_rows; + }); +} + +size_type aggregate_reader_metadata::calc_num_row_groups() const +{ + return std::accumulate( + per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { + return sum + pfm.row_groups.size(); + }); +} + +aggregate_reader_metadata::aggregate_reader_metadata( + std::vector> const& sources) + : per_file_metadata(metadatas_from_sources(sources)), + keyval_maps(collect_keyval_metadata()), + num_rows(calc_num_rows()), + num_row_groups(calc_num_row_groups()) +{ + if (per_file_metadata.size() > 0) { + auto const& first_meta = per_file_metadata.front(); + auto const num_cols = + first_meta.row_groups.size() > 0 ? first_meta.row_groups.front().columns.size() : 0; + auto const& schema = first_meta.schema; + + // Verify that the input files have matching numbers of columns and schema. + for (auto const& pfm : per_file_metadata) { + if (pfm.row_groups.size() > 0) { + CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(), + "All sources must have the same number of columns"); + } + CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema"); + } + } +} + +RowGroup const& aggregate_reader_metadata::get_row_group(size_type row_group_index, + size_type src_idx) const +{ + CUDF_EXPECTS(src_idx >= 0 && src_idx < static_cast(per_file_metadata.size()), + "invalid source index"); + return per_file_metadata[src_idx].row_groups[row_group_index]; +} + +ColumnChunkMetaData const& aggregate_reader_metadata::get_column_metadata(size_type row_group_index, + size_type src_idx, + int schema_idx) const +{ + auto col = std::find_if( + per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), + per_file_metadata[src_idx].row_groups[row_group_index].columns.end(), + [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx ? 
true : false; }); + CUDF_EXPECTS(col != std::end(per_file_metadata[src_idx].row_groups[row_group_index].columns), + "Found no metadata for schema index"); + return col->meta_data; +} + +std::string aggregate_reader_metadata::get_pandas_index() const +{ + // Assumes that all input files have the same metadata + // TODO: verify this assumption + auto it = keyval_maps[0].find("pandas"); + if (it != keyval_maps[0].end()) { + // Captures a list of quoted strings found inside square brackets after `"index_columns":` + // Inside quotes supports newlines, brackets, escaped quotes, etc. + // One-liner regex: + // "index_columns"\s*:\s*\[\s*((?:"(?:|(?:.*?(?![^\\]")).?)[^\\]?",?\s*)*)\] + // Documented below. + std::regex index_columns_expr{ + R"("index_columns"\s*:\s*\[\s*)" // match preamble, opening square bracket, whitespace + R"(()" // Open first capturing group + R"((?:")" // Open non-capturing group match opening quote + R"((?:|(?:.*?(?![^\\]")).?))" // match empty string or anything between quotes + R"([^\\]?")" // Match closing non-escaped quote + R"(,?\s*)" // Match optional comma and whitespace + R"()*)" // Close non-capturing group and repeat 0 or more times + R"())" // Close first capturing group + R"(\])" // Match closing square brackets + }; + std::smatch sm; + if (std::regex_search(it->second, sm, index_columns_expr)) { return sm[1].str(); } + } + return ""; +} + +std::vector aggregate_reader_metadata::get_pandas_index_names() const +{ + std::vector names; + auto str = get_pandas_index(); + if (str.length() != 0) { + std::regex index_name_expr{R"(\"((?:\\.|[^\"])*)\")"}; + std::smatch sm; + while (std::regex_search(str, sm, index_name_expr)) { + if (sm.size() == 2) { // 2 = whole match, first item + if (std::find(names.begin(), names.end(), sm[1].str()) == names.end()) { + std::regex esc_quote{R"(\\")"}; + names.emplace_back(std::regex_replace(sm[1].str(), esc_quote, R"(")")); + } + } + str = sm.suffix(); + } + } + return names; +} + +std::tuple> 
+aggregate_reader_metadata::select_row_groups( + host_span const> row_group_indices, + size_type row_start, + size_type row_count) const +{ + std::vector selection; + + if (!row_group_indices.empty()) { + CUDF_EXPECTS(row_group_indices.size() == per_file_metadata.size(), + "Must specify row groups for each source"); + + row_count = 0; + for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { + for (auto const& rowgroup_idx : row_group_indices[src_idx]) { + CUDF_EXPECTS( + rowgroup_idx >= 0 && + rowgroup_idx < static_cast(per_file_metadata[src_idx].row_groups.size()), + "Invalid rowgroup index"); + selection.emplace_back(rowgroup_idx, row_count, src_idx); + row_count += get_row_group(rowgroup_idx, src_idx).num_rows; + } + } + + return {row_start, row_count, std::move(selection)}; + } + + row_start = std::max(row_start, 0); + if (row_count < 0) { + row_count = std::min(get_num_rows(), std::numeric_limits::max()); + } + row_count = std::min(row_count, get_num_rows() - row_start); + CUDF_EXPECTS(row_count >= 0, "Invalid row count"); + CUDF_EXPECTS(row_start <= get_num_rows(), "Invalid row start"); + + size_type count = 0; + for (size_t src_idx = 0; src_idx < per_file_metadata.size(); ++src_idx) { + for (size_t rg_idx = 0; rg_idx < per_file_metadata[src_idx].row_groups.size(); ++rg_idx) { + auto const chunk_start_row = count; + count += get_row_group(rg_idx, src_idx).num_rows; + if (count > row_start || count == 0) { + selection.emplace_back(rg_idx, chunk_start_row, src_idx); + } + if (count >= row_start + row_count) { break; } + } + } + + return {row_start, row_count, std::move(selection)}; +} + +std::tuple, std::vector, std::vector> +aggregate_reader_metadata::select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const +{ + auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) { + auto const& col_schema_idx = + 
std::find_if(schema_elem.children_idx.cbegin(), + schema_elem.children_idx.cend(), + [&](size_t col_schema_idx) { return get_schema(col_schema_idx).name == name; }); + + return (col_schema_idx != schema_elem.children_idx.end()) + ? static_cast(*col_schema_idx) + : -1; + }; + + std::vector output_columns; + std::vector input_columns; + std::vector nesting; + + // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is + // valid if "struct1.child1" exists in this file's schema. If "struct1" exists but "child1" is + // not a child of "struct1" then the function will return false for "struct1" + std::function&, bool)> + build_column = [&](column_name_info const* col_name_info, + int schema_idx, + std::vector& out_col_array, + bool has_list_parent) { + if (schema_idx < 0) { return false; } + auto const& schema_elem = get_schema(schema_idx); + + // if schema_elem is a stub then it does not exist in the column_name_info and column_buffer + // hierarchy. So continue on + if (schema_elem.is_stub()) { + // is this legit? + CUDF_EXPECTS(schema_elem.num_children == 1, "Unexpected number of children for stub"); + auto child_col_name_info = (col_name_info) ? &col_name_info->children[0] : nullptr; + return build_column( + child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent); + } + + // if we're at the root, this is a new output column + auto const col_type = schema_elem.is_one_level_list() + ? 
type_id::LIST + : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); + auto const dtype = to_data_type(col_type, schema_elem); + + column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } + // store the index of this element if inserted in out_col_array + nesting.push_back(static_cast(out_col_array.size())); + output_col.name = schema_elem.name; + + // build each child + bool path_is_valid = false; + if (col_name_info == nullptr or col_name_info->children.empty()) { + // add all children of schema_elem. + // At this point, we can no longer pass a col_name_info to build_column + for (int idx = 0; idx < schema_elem.num_children; idx++) { + path_is_valid |= build_column(nullptr, + schema_elem.children_idx[idx], + output_col.children, + has_list_parent || col_type == type_id::LIST); + } + } else { + for (size_t idx = 0; idx < col_name_info->children.size(); idx++) { + path_is_valid |= + build_column(&col_name_info->children[idx], + find_schema_child(schema_elem, col_name_info->children[idx].name), + output_col.children, + has_list_parent || col_type == type_id::LIST); + } + } + + // if I have no children, we're at a leaf and I'm an input column (that is, one with actual + // data stored) so add me to the list. 
+ if (schema_elem.num_children == 0) { + input_column_info& input_col = input_columns.emplace_back( + input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0}); + + // set up child output column for one-level encoding list + if (schema_elem.is_one_level_list()) { + // determine the element data type + auto const element_type = + to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); + auto const element_dtype = to_data_type(element_type, schema_elem); + + column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + if (has_list_parent || col_type == type_id::LIST) { + element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; + } + // store the index of this element + nesting.push_back(static_cast(output_col.children.size())); + // TODO: not sure if we should assign a name or leave it blank + element_col.name = "element"; + + output_col.children.push_back(std::move(element_col)); + } + + std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); + + // pop off the extra nesting element. + if (schema_elem.is_one_level_list()) { nesting.pop_back(); } + + path_is_valid = true; // If we're able to reach leaf then path is valid + } + + if (path_is_valid) { out_col_array.push_back(std::move(output_col)); } + + nesting.pop_back(); + return path_is_valid; + }; + + std::vector output_column_schemas; + + // + // there is not necessarily a 1:1 mapping between input columns and output columns. + // For example, parquet does not explicitly store a ColumnChunkDesc for struct columns. + // The "structiness" is simply implied by the schema. For example, this schema: + // required group field_id=1 name { + // required binary field_id=2 firstname (String); + // required binary field_id=3 middlename (String); + // required binary field_id=4 lastname (String); + // } + // will only contain 3 internal columns of data (firstname, middlename, lastname). 
But of + // course "name" is ultimately the struct column we want to return. + // + // "firstname", "middlename" and "lastname" represent the input columns in the file that we + // process to produce the final cudf "name" column. + // + // A user can ask for a single field out of the struct e.g. firstname. + // In this case they'll pass a fully qualified name to the schema element like + // ["name", "firstname"] + // + auto const& root = get_schema(0); + if (not use_names.has_value()) { + for (auto const& schema_idx : root.children_idx) { + build_column(nullptr, schema_idx, output_columns, false); + output_column_schemas.push_back(schema_idx); + } + } else { + struct path_info { + std::string full_path; + int schema_idx; + }; + + // Convert schema into a vector of every possible path + std::vector all_paths; + std::function add_path = [&](std::string path_till_now, + int schema_idx) { + auto const& schema_elem = get_schema(schema_idx); + std::string curr_path = path_till_now + schema_elem.name; + all_paths.push_back({curr_path, schema_idx}); + for (auto const& child_idx : schema_elem.children_idx) { + add_path(curr_path + ".", child_idx); + } + }; + for (auto const& child_idx : get_schema(0).children_idx) { + add_path("", child_idx); + } + + // Find which of the selected paths are valid and get their schema index + std::vector valid_selected_paths; + for (auto const& selected_path : *use_names) { + auto found_path = + std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) { + return valid_path.full_path == selected_path; + }); + if (found_path != all_paths.end()) { + valid_selected_paths.push_back({selected_path, found_path->schema_idx}); + } + } + + // Now construct paths as vector of strings for further consumption + std::vector> use_names3; + std::transform(valid_selected_paths.begin(), + valid_selected_paths.end(), + std::back_inserter(use_names3), + [&](path_info const& valid_path) { + auto schema_idx = valid_path.schema_idx; + std::vector 
result_path; + do { + SchemaElement const& elem = get_schema(schema_idx); + result_path.push_back(elem.name); + schema_idx = elem.parent_idx; + } while (schema_idx > 0); + return std::vector(result_path.rbegin(), result_path.rend()); + }); + + std::vector selected_columns; + if (include_index) { + std::vector index_names = get_pandas_index_names(); + std::transform(index_names.cbegin(), + index_names.cend(), + std::back_inserter(selected_columns), + [](std::string const& name) { return column_name_info(name); }); + } + // Merge the vector use_names into a set of hierarchical column_name_info objects + /* This is because if we have columns like this: + * col1 + * / \ + * s3 f4 + * / \ + * f5 f6 + * + * there may be common paths in use_names like: + * {"col1", "s3", "f5"}, {"col1", "f4"} + * which means we want the output to contain + * col1 + * / \ + * s3 f4 + * / + * f5 + * + * rather than + * col1 col1 + * | | + * s3 f4 + * | + * f5 + */ + for (auto const& path : use_names3) { + auto array_to_find_in = &selected_columns; + for (size_t depth = 0; depth < path.size(); ++depth) { + // Check if the path exists in our selected_columns and if not, add it. + auto const& name_to_find = path[depth]; + auto found_col = std::find_if( + array_to_find_in->begin(), + array_to_find_in->end(), + [&name_to_find](column_name_info const& col) { return col.name == name_to_find; }); + if (found_col == array_to_find_in->end()) { + auto& col = array_to_find_in->emplace_back(name_to_find); + array_to_find_in = &col.children; + } else { + // Path exists. go down further. 
+ array_to_find_in = &found_col->children; + } + } + } + for (auto& col : selected_columns) { + auto const& top_level_col_schema_idx = find_schema_child(root, col.name); + bool valid_column = build_column(&col, top_level_col_schema_idx, output_columns, false); + if (valid_column) output_column_schemas.push_back(top_level_col_schema_idx); + } + } + + return std::make_tuple( + std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); +} + +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp new file mode 100644 index 00000000000..6fa86a77e46 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "compact_protocol_reader.hpp" +#include "parquet_gpu.hpp" + +#include +#include +#include + +#include +#include + +namespace cudf::io::detail::parquet { + +using namespace cudf::io::parquet; + +/** + * @brief Function that translates Parquet datatype to cuDF type enum + */ +[[nodiscard]] type_id to_type_id(SchemaElement const& schema, + bool strings_to_categorical, + type_id timestamp_type_id); + +/** + * @brief Converts cuDF type enum to column logical type + */ +[[nodiscard]] inline data_type to_data_type(type_id t_id, SchemaElement const& schema) +{ + return t_id == type_id::DECIMAL32 || t_id == type_id::DECIMAL64 || t_id == type_id::DECIMAL128 + ? data_type{t_id, numeric::scale_type{-schema.decimal_scale}} + : data_type{t_id}; +} + +/** + * @brief The row_group_info class + */ +struct row_group_info { + size_type const index; + size_t const start_row; // TODO source index + size_type const source_index; + row_group_info(size_type index, size_t start_row, size_type source_index) + : index(index), start_row(start_row), source_index(source_index) + { + } +}; + +/** + * @brief Class for parsing dataset metadata + */ +struct metadata : public FileMetaData { + explicit metadata(datasource* source); +}; + +class aggregate_reader_metadata { + std::vector per_file_metadata; + std::vector> keyval_maps; + size_type num_rows; + size_type num_row_groups; + + /** + * @brief Create a metadata object from each element in the source vector + */ + static std::vector metadatas_from_sources( + std::vector> const& sources); + + /** + * @brief Collect the keyvalue maps from each per-file metadata object into a vector of maps. 
+ */ + [[nodiscard]] std::vector> collect_keyval_metadata() + const; + + /** + * @brief Sums up the number of rows of each source + */ + [[nodiscard]] size_type calc_num_rows() const; + + /** + * @brief Sums up the number of row groups of each source + */ + [[nodiscard]] size_type calc_num_row_groups() const; + + public: + aggregate_reader_metadata(std::vector> const& sources); + + [[nodiscard]] RowGroup const& get_row_group(size_type row_group_index, size_type src_idx) const; + + [[nodiscard]] ColumnChunkMetaData const& get_column_metadata(size_type row_group_index, + size_type src_idx, + int schema_idx) const; + + [[nodiscard]] auto get_num_rows() const { return num_rows; } + + [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; } + + [[nodiscard]] auto const& get_schema(int schema_idx) const + { + return per_file_metadata[0].schema[schema_idx]; + } + + [[nodiscard]] auto const& get_key_value_metadata() const { return keyval_maps; } + + /** + * @brief Gets the concrete nesting depth of output cudf columns + * + * @param schema_index Schema index of the input column + * + * @return comma-separated index column names in quotes + */ + [[nodiscard]] inline int get_output_nesting_depth(int schema_index) const + { + auto& pfm = per_file_metadata[0]; + int depth = 0; + + // walk upwards, skipping repeated fields + while (schema_index > 0) { + if (!pfm.schema[schema_index].is_stub()) { depth++; } + // schema of one-level encoding list doesn't contain nesting information, so we need to + // manually add an extra nesting level + if (pfm.schema[schema_index].is_one_level_list()) { depth++; } + schema_index = pfm.schema[schema_index].parent_idx; + } + return depth; + } + + /** + * @brief Extracts the pandas "index_columns" section + * + * PANDAS adds its own metadata to the key_value section when writing out the + * dataframe to a file to aid in exact reconstruction. The JSON-formatted + * metadata contains the index column(s) and PANDA-specific datatypes. 
+ * + * @return comma-separated index column names in quotes + */ + [[nodiscard]] std::string get_pandas_index() const; + + /** + * @brief Extracts the column name(s) used for the row indexes in a dataframe + * + * @param names List of column names to load, where index column name(s) will be added + */ + [[nodiscard]] std::vector get_pandas_index_names() const; + + /** + * @brief Filters and reduces down to a selection of row groups + * + * The input `row_start` and `row_count` parameters will be recomputed and output as the valid + * values based on the input row group list. + * + * @param row_group_indices Lists of row groups to read, one per source + * @param row_start Starting row of the selection + * @param row_count Total number of rows selected + * + * @return A tuple of corrected row_start, row_count and list of row group indexes and its + * starting row + */ + [[nodiscard]] std::tuple> select_row_groups( + host_span const> row_group_indices, + size_type row_start, + size_type row_count) const; + + /** + * @brief Filters and reduces down to a selection of columns + * + * @param use_names List of paths of column names to select; `nullopt` if user did not select + * columns to read + * @param include_index Whether to always include the PANDAS index column(s) + * @param strings_to_categorical Type conversion parameter + * @param timestamp_type_id Type conversion parameter + * + * @return input column information, output column information, list of output column schema + * indices + */ + [[nodiscard]] std:: + tuple, std::vector, std::vector> + select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const; +}; + +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu new file mode 100644 index 00000000000..ca2009d3c74 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -0,0 +1,814 @@ 
+/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl.hpp" + +#include +#include +#include + +#include + +#include + +#include +#include + +#include + +namespace cudf::io::detail::parquet { + +namespace { + +/** + * @brief Generate depth remappings for repetition and definition levels. + * + * When dealing with columns that contain lists, we must examine incoming + * repetition and definition level pairs to determine what range of output nesting + * is indicated when adding new values. 
This function generates the mappings of + * the R/D levels to those start/end bounds + * + * @param remap Maps column schema index to the R/D remapping vectors for that column + * @param src_col_schema The column schema to generate the new mapping for + * @param md File metadata information + */ +void generate_depth_remappings(std::map, std::vector>>& remap, + int src_col_schema, + aggregate_reader_metadata const& md) +{ + // already generated for this level + if (remap.find(src_col_schema) != remap.end()) { return; } + auto schema = md.get_schema(src_col_schema); + int max_depth = md.get_output_nesting_depth(src_col_schema); + + CUDF_EXPECTS(remap.find(src_col_schema) == remap.end(), + "Attempting to remap a schema more than once"); + auto inserted = + remap.insert(std::pair, std::vector>>{src_col_schema, {}}); + auto& depth_remap = inserted.first->second; + + std::vector& rep_depth_remap = (depth_remap.first); + rep_depth_remap.resize(schema.max_repetition_level + 1); + std::vector& def_depth_remap = (depth_remap.second); + def_depth_remap.resize(schema.max_definition_level + 1); + + // the key: + // for incoming level values R/D + // add values starting at the shallowest nesting level X has repetition level R + // until you reach the deepest nesting level Y that corresponds to the repetition level R1 + // held by the nesting level that has definition level D + // + // Example: a 3 level struct with a list at the bottom + // + // R / D Depth + // level0 0 / 1 0 + // level1 0 / 2 1 + // level2 0 / 3 2 + // list 0 / 3 3 + // element 1 / 4 4 + // + // incoming R/D : 0, 0 -> add values from depth 0 to 3 (def level 0 always maps to depth 0) + // incoming R/D : 0, 1 -> add values from depth 0 to 3 + // incoming R/D : 0, 2 -> add values from depth 0 to 3 + // incoming R/D : 1, 4 -> add values from depth 4 to 4 + // + // Note : the -validity- of values is simply checked by comparing the incoming D value against the + // D value of the given nesting level (incoming D >= 
the D for the nesting level == valid, + // otherwise NULL). The tricky part is determining what nesting levels to add values at. + // + // For schemas with no repetition level (no lists), X is always 0 and Y is always max nesting + // depth. + // + + // compute "X" from above + for (int s_idx = schema.max_repetition_level; s_idx >= 0; s_idx--) { + auto find_shallowest = [&](int r) { + int shallowest = -1; + int cur_depth = max_depth - 1; + int schema_idx = src_col_schema; + while (schema_idx > 0) { + auto cur_schema = md.get_schema(schema_idx); + if (cur_schema.max_repetition_level == r) { + // if this is a repeated field, map it one level deeper + shallowest = cur_schema.is_stub() ? cur_depth + 1 : cur_depth; + } + // if it's one-level encoding list + else if (cur_schema.is_one_level_list()) { + shallowest = cur_depth - 1; + } + if (!cur_schema.is_stub()) { cur_depth--; } + schema_idx = cur_schema.parent_idx; + } + return shallowest; + }; + rep_depth_remap[s_idx] = find_shallowest(s_idx); + } + + // compute "Y" from above + for (int s_idx = schema.max_definition_level; s_idx >= 0; s_idx--) { + auto find_deepest = [&](int d) { + SchemaElement prev_schema; + int schema_idx = src_col_schema; + int r1 = 0; + while (schema_idx > 0) { + SchemaElement cur_schema = md.get_schema(schema_idx); + if (cur_schema.max_definition_level == d) { + // if this is a repeated field, map it one level deeper + r1 = cur_schema.is_stub() ? prev_schema.max_repetition_level + : cur_schema.max_repetition_level; + break; + } + prev_schema = cur_schema; + schema_idx = cur_schema.parent_idx; + } + + // we now know R1 from above. return the deepest nesting level that has the + // same repetition level + schema_idx = src_col_schema; + int depth = max_depth - 1; + while (schema_idx > 0) { + SchemaElement cur_schema = md.get_schema(schema_idx); + if (cur_schema.max_repetition_level == r1) { + // if this is a repeated field, map it one level deeper + depth = cur_schema.is_stub() ? 
depth + 1 : depth; + break; + } + if (!cur_schema.is_stub()) { depth--; } + prev_schema = cur_schema; + schema_idx = cur_schema.parent_idx; + } + return depth; + }; + def_depth_remap[s_idx] = find_deepest(s_idx); + } +} + +/** + * @brief Function that returns the required the number of bits to store a value + */ +template +[[nodiscard]] T required_bits(uint32_t max_level) +{ + return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); +} + +/** + * @brief Converts cuDF units to Parquet units. + * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + */ +[[nodiscard]] std::tuple conversion_info(type_id column_type_id, + type_id timestamp_type_id, + parquet::Type physical, + int8_t converted, + int32_t length) +{ + int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t clock_rate = 0; + if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { + type_width = 1; // I32 -> I8 + } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { + type_width = 2; // I32 -> I16 + } else if (column_type_id == type_id::INT32) { + type_width = 4; // str -> hash32 + } else if (is_chrono(data_type{column_type_id})) { + clock_rate = to_clockrate(timestamp_type_id); + } + + int8_t converted_type = converted; + if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && + not cudf::is_fixed_point(data_type{column_type_id})) { + converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal + } + return std::make_tuple(type_width, clock_rate, converted_type); +} + +/** + * @brief Reads compressed page data to device memory + * + * @param sources Dataset sources + * @param page_data Buffers to hold compressed page data for each chunk + * @param chunks List of column chunk descriptors + * @param begin_chunk Index of first column chunk to read + * @param end_chunk Index after the last column chunk to read + * @param column_chunk_offsets 
File offset for all chunks + * @param chunk_source_map Association between each column chunk and its source + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A future object for reading synchronization + */ +[[nodiscard]] std::future read_column_chunks_async( + std::vector> const& sources, + std::vector>& page_data, + hostdevice_vector& chunks, + size_t begin_chunk, + size_t end_chunk, + const std::vector& column_chunk_offsets, + std::vector const& chunk_source_map, + rmm::cuda_stream_view stream) +{ + // Transfer chunk data, coalescing adjacent chunks + std::vector> read_tasks; + for (size_t chunk = begin_chunk; chunk < end_chunk;) { + const size_t io_offset = column_chunk_offsets[chunk]; + size_t io_size = chunks[chunk].compressed_size; + size_t next_chunk = chunk + 1; + const bool is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); + while (next_chunk < end_chunk) { + const size_t next_offset = column_chunk_offsets[next_chunk]; + const bool is_next_compressed = + (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); + if (next_offset != io_offset + io_size || is_next_compressed != is_compressed) { + // Can't merge if not contiguous or mixing compressed and uncompressed + // Not coalescing uncompressed with compressed chunks is so that compressed buffers can be + // freed earlier (immediately after decompression stage) to limit peak memory requirements + break; + } + io_size += chunks[next_chunk].compressed_size; + next_chunk++; + } + if (io_size != 0) { + auto& source = sources[chunk_source_map[chunk]]; + if (source->is_device_read_preferred(io_size)) { + auto buffer = rmm::device_buffer(io_size, stream); + auto fut_read_size = source->device_read_async( + io_offset, io_size, static_cast(buffer.data()), stream); + read_tasks.emplace_back(std::move(fut_read_size)); + page_data[chunk] = datasource::buffer::create(std::move(buffer)); + } else { + auto const buffer = 
source->host_read(io_offset, io_size); + page_data[chunk] = + datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), stream)); + } + auto d_compdata = page_data[chunk]->data(); + do { + chunks[chunk].compressed_data = d_compdata; + d_compdata += chunks[chunk].compressed_size; + } while (++chunk != next_chunk); + } else { + chunk = next_chunk; + } + } + auto sync_fn = [](decltype(read_tasks) read_tasks) { + for (auto& task : read_tasks) { + task.wait(); + } + }; + return std::async(std::launch::deferred, sync_fn, std::move(read_tasks)); +} + +/** + * @brief Return the number of total pages from the given column chunks. + * + * @param chunks List of column chunk descriptors + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return The total number of pages + */ +[[nodiscard]] size_t count_page_headers(hostdevice_vector& chunks, + rmm::cuda_stream_view stream) +{ + size_t total_pages = 0; + + chunks.host_to_device(stream); + gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + chunks.device_to_host(stream, true); + + for (size_t c = 0; c < chunks.size(); c++) { + total_pages += chunks[c].num_data_pages + chunks[c].num_dict_pages; + } + + return total_pages; +} + +/** + * @brief Decode the page information from the given column chunks. + * + * @param chunks List of column chunk descriptors + * @param pages List of page information + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void decode_page_headers(hostdevice_vector& chunks, + hostdevice_vector& pages, + rmm::cuda_stream_view stream) +{ + // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), + // please update preprocess_nested_columns to reflect this. 
+  for (size_t c = 0, page_count = 0; c < chunks.size(); c++) {
+    chunks[c].max_num_pages = chunks[c].num_data_pages + chunks[c].num_dict_pages;
+    chunks[c].page_info     = pages.device_ptr(page_count);
+    page_count += chunks[c].max_num_pages;
+  }
+
+  chunks.host_to_device(stream);
+  gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);
+  pages.device_to_host(stream, true);
+}
+
+/**
+ * @brief Decompresses the page data, at page granularity.
+ *
+ * @param chunks List of column chunk descriptors
+ * @param pages List of page information
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ *
+ * @return Device buffer to decompressed page data
+ */
+[[nodiscard]] rmm::device_buffer decompress_page_data(
+  hostdevice_vector<gpu::ColumnChunkDesc>& chunks,
+  hostdevice_vector<gpu::PageInfo>& pages,
+  rmm::cuda_stream_view stream)
+{
+  auto for_each_codec_page = [&](parquet::Compression codec,
+                                 const std::function<void(size_t)>& f) {
+    for (size_t c = 0, page_count = 0; c < chunks.size(); c++) {
+      const auto page_stride = chunks[c].max_num_pages;
+      if (chunks[c].codec == codec) {
+        for (int k = 0; k < page_stride; k++) {
+          f(page_count + k);
+        }
+      }
+      page_count += page_stride;
+    }
+  };
+
+  // Brotli scratch memory for decompressing
+  rmm::device_buffer debrotli_scratch;
+
+  // Count the exact number of compressed pages
+  size_t num_comp_pages    = 0;
+  size_t total_decomp_size = 0;
+
+  struct codec_stats {
+    parquet::Compression compression_type = UNCOMPRESSED;
+    size_t num_pages                      = 0;
+    int32_t max_decompressed_size         = 0;
+    size_t total_decomp_size              = 0;
+  };
+
+  std::array codecs{codec_stats{parquet::GZIP},
+                    codec_stats{parquet::SNAPPY},
+                    codec_stats{parquet::BROTLI},
+                    codec_stats{parquet::ZSTD}};
+
+  auto is_codec_supported = [&codecs](int8_t codec) {
+    if (codec == parquet::UNCOMPRESSED) return true;
+    return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) {
+             return codec == cstats.compression_type;
+           }) != codecs.end();
+  };
+
CUDF_EXPECTS(std::all_of(chunks.begin(), + chunks.end(), + [&is_codec_supported](auto const& chunk) { + return is_codec_supported(chunk.codec); + }), + "Unsupported compression type"); + + for (auto& codec : codecs) { + for_each_codec_page(codec.compression_type, [&](size_t page) { + auto page_uncomp_size = pages[page].uncompressed_page_size; + total_decomp_size += page_uncomp_size; + codec.total_decomp_size += page_uncomp_size; + codec.max_decompressed_size = std::max(codec.max_decompressed_size, page_uncomp_size); + codec.num_pages++; + num_comp_pages++; + }); + if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { + debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); + } + } + + // Dispatch batches of pages to decompress for each codec + rmm::device_buffer decomp_pages(total_decomp_size, stream); + + std::vector> comp_in; + comp_in.reserve(num_comp_pages); + std::vector> comp_out; + comp_out.reserve(num_comp_pages); + + // vectors to save v2 def and rep level data, if any + std::vector> copy_in; + copy_in.reserve(num_comp_pages); + std::vector> copy_out; + copy_out.reserve(num_comp_pages); + + rmm::device_uvector comp_res(num_comp_pages, stream); + thrust::fill(rmm::exec_policy(stream), + comp_res.begin(), + comp_res.end(), + compression_result{0, compression_status::FAILURE}); + + size_t decomp_offset = 0; + int32_t start_pos = 0; + for (const auto& codec : codecs) { + if (codec.num_pages == 0) { continue; } + + for_each_codec_page(codec.compression_type, [&](size_t page_idx) { + auto const dst_base = static_cast(decomp_pages.data()) + decomp_offset; + auto& page = pages[page_idx]; + // offset will only be non-zero for V2 pages + auto const offset = page.def_lvl_bytes + page.rep_lvl_bytes; + // for V2 need to copy def and rep level info into place, and then offset the + // input and output buffers. otherwise we'd have to keep both the compressed + // and decompressed data. 
+ if (offset != 0) { + copy_in.emplace_back(page.page_data, offset); + copy_out.emplace_back(dst_base, offset); + } + comp_in.emplace_back(page.page_data + offset, + static_cast(page.compressed_page_size - offset)); + comp_out.emplace_back(dst_base + offset, + static_cast(page.uncompressed_page_size - offset)); + page.page_data = dst_base; + decomp_offset += page.uncompressed_page_size; + }); + + host_span const> comp_in_view{comp_in.data() + start_pos, + codec.num_pages}; + auto const d_comp_in = cudf::detail::make_device_uvector_async(comp_in_view, stream); + host_span const> comp_out_view(comp_out.data() + start_pos, + codec.num_pages); + auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, stream); + device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); + + switch (codec.compression_type) { + case parquet::GZIP: + gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); + break; + case parquet::SNAPPY: + if (nvcomp_integration::is_stable_enabled()) { + nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, + d_comp_in, + d_comp_out, + d_comp_res_view, + codec.max_decompressed_size, + codec.total_decomp_size, + stream); + } else { + gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); + } + break; + case parquet::ZSTD: + nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, + d_comp_in, + d_comp_out, + d_comp_res_view, + codec.max_decompressed_size, + codec.total_decomp_size, + stream); + break; + case parquet::BROTLI: + gpu_debrotli(d_comp_in, + d_comp_out, + d_comp_res_view, + debrotli_scratch.data(), + debrotli_scratch.size(), + stream); + break; + default: CUDF_FAIL("Unexpected decompression dispatch"); break; + } + start_pos += codec.num_pages; + } + + CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), + comp_res.begin(), + comp_res.end(), + [] __device__(auto const& res) { + return res.status == compression_status::SUCCESS; + }), + "Error during 
decompression"); + + // now copy the uncompressed V2 def and rep level data + if (not copy_in.empty()) { + auto const d_copy_in = cudf::detail::make_device_uvector_async(copy_in, stream); + auto const d_copy_out = cudf::detail::make_device_uvector_async(copy_out, stream); + + gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream); + stream.synchronize(); + } + + // Update the page information in device memory with the updated value of + // page_data; it now points to the uncompressed data buffer + pages.host_to_device(stream); + + return decomp_pages; +} + +} // namespace + +void reader::impl::allocate_nesting_info() +{ + auto const& chunks = _file_itm_data.chunks; + auto& pages = _file_itm_data.pages_info; + auto& page_nesting_info = _file_itm_data.page_nesting_info; + + // compute total # of page_nesting infos needed and allocate space. doing this in one + // buffer to keep it to a single gpu allocation + size_t const total_page_nesting_infos = std::accumulate( + chunks.host_ptr(), chunks.host_ptr() + chunks.size(), 0, [&](int total, auto& chunk) { + // the schema of the input column + auto const& schema = _metadata->get_schema(chunk.src_col_schema); + auto const per_page_nesting_info_size = max( + schema.max_definition_level + 1, _metadata->get_output_nesting_depth(chunk.src_col_schema)); + return total + (per_page_nesting_info_size * chunk.num_data_pages); + }); + + page_nesting_info = hostdevice_vector{total_page_nesting_infos, _stream}; + + // retrieve from the gpu so we can update + pages.device_to_host(_stream, true); + + // update pointers in the PageInfos + int target_page_index = 0; + int src_info_index = 0; + for (size_t idx = 0; idx < chunks.size(); idx++) { + int src_col_schema = chunks[idx].src_col_schema; + auto& schema = _metadata->get_schema(src_col_schema); + auto const per_page_nesting_info_size = std::max( + schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); + + // skip my dict pages + target_page_index 
+= chunks[idx].num_dict_pages; + for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { + pages[target_page_index + p_idx].nesting = page_nesting_info.device_ptr() + src_info_index; + pages[target_page_index + p_idx].num_nesting_levels = per_page_nesting_info_size; + + src_info_index += per_page_nesting_info_size; + } + target_page_index += chunks[idx].num_data_pages; + } + + // copy back to the gpu + pages.host_to_device(_stream); + + // fill in + int nesting_info_index = 0; + std::map, std::vector>> depth_remapping; + for (size_t idx = 0; idx < chunks.size(); idx++) { + int src_col_schema = chunks[idx].src_col_schema; + + // schema of the input column + auto& schema = _metadata->get_schema(src_col_schema); + // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) + int max_depth = _metadata->get_output_nesting_depth(src_col_schema); + + // # of nesting infos stored per page for this column + auto const per_page_nesting_info_size = std::max(schema.max_definition_level + 1, max_depth); + + // if this column has lists, generate depth remapping + std::map, std::vector>> depth_remapping; + if (schema.max_repetition_level > 0) { + generate_depth_remappings(depth_remapping, src_col_schema, *_metadata); + } + + // fill in host-side nesting info + int schema_idx = src_col_schema; + auto cur_schema = _metadata->get_schema(schema_idx); + int cur_depth = max_depth - 1; + while (schema_idx > 0) { + // stub columns (basically the inner field of a list scheme element) are not real columns. 
+ // we can ignore them for the purposes of output nesting info + if (!cur_schema.is_stub()) { + // initialize each page within the chunk + for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { + gpu::PageNestingInfo* pni = + &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; + + // if we have lists, set our start and end depth remappings + if (schema.max_repetition_level > 0) { + auto remap = depth_remapping.find(src_col_schema); + CUDF_EXPECTS(remap != depth_remapping.end(), + "Could not find depth remapping for schema"); + std::vector const& rep_depth_remap = (remap->second.first); + std::vector const& def_depth_remap = (remap->second.second); + + for (size_t m = 0; m < rep_depth_remap.size(); m++) { + pni[m].start_depth = rep_depth_remap[m]; + } + for (size_t m = 0; m < def_depth_remap.size(); m++) { + pni[m].end_depth = def_depth_remap[m]; + } + } + + // values indexed by output column index + pni[cur_depth].max_def_level = cur_schema.max_definition_level; + pni[cur_depth].max_rep_level = cur_schema.max_repetition_level; + pni[cur_depth].size = 0; + } + + // move up the hierarchy + cur_depth--; + } + + // next schema + schema_idx = cur_schema.parent_idx; + cur_schema = _metadata->get_schema(schema_idx); + } + + nesting_info_index += (per_page_nesting_info_size * chunks[idx].num_data_pages); + } + + // copy nesting info to the device + page_nesting_info.host_to_device(_stream); +} + +void reader::impl::load_and_decompress_data(std::vector const& row_groups_info, + size_type num_rows) +{ + // This function should never be called if `num_rows == 0`. 
+  CUDF_EXPECTS(num_rows > 0, "Number of reading rows must not be zero.");
+
+  auto& raw_page_data    = _file_itm_data.raw_page_data;
+  auto& decomp_page_data = _file_itm_data.decomp_page_data;
+  auto& chunks           = _file_itm_data.chunks;
+  auto& pages_info       = _file_itm_data.pages_info;
+
+  // Descriptors for all the chunks that make up the selected columns
+  const auto num_input_columns = _input_columns.size();
+  const auto num_chunks        = row_groups_info.size() * num_input_columns;
+  chunks = hostdevice_vector<gpu::ColumnChunkDesc>(0, num_chunks, _stream);
+
+  // Association between each column chunk and its source
+  std::vector<size_type> chunk_source_map(num_chunks);
+
+  // Tracker for eventually deallocating compressed and uncompressed data
+  raw_page_data = std::vector<std::unique_ptr<datasource::buffer>>(num_chunks);
+
+  // Keep track of column chunk file offsets
+  std::vector<size_t> column_chunk_offsets(num_chunks);
+
+  // Initialize column chunk information
+  size_t total_decompressed_size = 0;
+  auto remaining_rows            = num_rows;
+  std::vector<std::future<void>> read_rowgroup_tasks;
+  for (const auto& rg : row_groups_info) {
+    const auto& row_group       = _metadata->get_row_group(rg.index, rg.source_index);
+    auto const row_group_start  = rg.start_row;
+    auto const row_group_source = rg.source_index;
+    auto const row_group_rows   = std::min(remaining_rows, row_group.num_rows);
+    auto const io_chunk_idx     = chunks.size();
+
+    // generate ColumnChunkDesc objects for everything to be decoded (all input columns)
+    for (size_t i = 0; i < num_input_columns; ++i) {
+      auto col = _input_columns[i];
+      // look up metadata
+      auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx);
+      auto& schema   = _metadata->get_schema(col.schema_idx);
+
+      auto [type_width, clock_rate, converted_type] =
+        conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()),
+                        _timestamp_type.id(),
+                        schema.type,
+                        schema.converted_type,
+                        schema.type_length);
+
+      column_chunk_offsets[chunks.size()] =
+        (col_meta.dictionary_page_offset != 0)
+          ?
std::min(col_meta.data_page_offset, col_meta.dictionary_page_offset) + : col_meta.data_page_offset; + + chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_scale, + clock_rate, + i, + col.schema_idx)); + + // Map each column chunk to its column index and its source index + chunk_source_map[chunks.size() - 1] = row_group_source; + + if (col_meta.codec != Compression::UNCOMPRESSED) { + total_decompressed_size += col_meta.total_uncompressed_size; + } + } + // Read compressed chunk data to device memory + read_rowgroup_tasks.push_back(read_column_chunks_async(_sources, + raw_page_data, + chunks, + io_chunk_idx, + chunks.size(), + column_chunk_offsets, + chunk_source_map, + _stream)); + + remaining_rows -= row_group.num_rows; + } + for (auto& task : read_rowgroup_tasks) { + task.wait(); + } + CUDF_EXPECTS(remaining_rows <= 0, "All rows data must be read."); + + // Process dataset chunk pages into output columns + auto const total_pages = count_page_headers(chunks, _stream); + pages_info = hostdevice_vector(total_pages, total_pages, _stream); + + if (total_pages > 0) { + // decoding of column/page information + decode_page_headers(chunks, pages_info, _stream); + if (total_decompressed_size > 0) { + decomp_page_data = decompress_page_data(chunks, pages_info, _stream); + // Free compressed data + for (size_t c = 0; c < chunks.size(); c++) { + if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { + raw_page_data[c].reset(); + // TODO: Check if this is called + } + } + } + + // build output column info + // walk the schema, building out_buffers that mirror what our final 
cudf columns will look + // like. important : there is not necessarily a 1:1 mapping between input columns and output + // columns. For example, parquet does not explicitly store a ColumnChunkDesc for struct + // columns. The "structiness" is simply implied by the schema. For example, this schema: + // required group field_id=1 name { + // required binary field_id=2 firstname (String); + // required binary field_id=3 middlename (String); + // required binary field_id=4 lastname (String); + // } + // will only contain 3 columns of data (firstname, middlename, lastname). But of course + // "name" is a struct column that we want to return, so we have to make sure that we + // create it ourselves. + // std::vector output_info = build_output_column_info(); + + // nesting information (sizes, etc) stored -per page- + // note : even for flat schemas, we allocate 1 level of "nesting" info + allocate_nesting_info(); + } +} + +void reader::impl::allocate_columns(size_t min_row, size_t total_rows, bool uses_custom_row_bounds) +{ + auto const& chunks = _file_itm_data.chunks; + auto& pages = _file_itm_data.pages_info; + + // iterate over all input columns and allocate any associated output + // buffers if they are not part of a list hierarchy. mark down + // if we have any list columns that need further processing. + bool has_lists = false; + for (size_t idx = 0; idx < _input_columns.size(); idx++) { + auto const& input_col = _input_columns[idx]; + size_t const max_depth = input_col.nesting_depth(); + + auto* cols = &_output_buffers; + for (size_t l_idx = 0; l_idx < max_depth; l_idx++) { + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; + cols = &out_buf.children; + + // if this has a list parent, we will have to do further work in gpu::PreprocessColumnData + // to know how big this buffer actually is. 
+ if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { + has_lists = true; + } + // if we haven't already processed this column because it is part of a struct hierarchy + else if (out_buf.size == 0) { + // add 1 for the offset if this is a list column + out_buf.create( + out_buf.type.id() == type_id::LIST && l_idx < max_depth ? total_rows + 1 : total_rows, + _stream, + _mr); + } + } + } + + // if we have columns containing lists, further preprocessing is necessary. + if (has_lists) { + gpu::PreprocessColumnData(pages, + chunks, + _input_columns, + _output_buffers, + total_rows, + min_row, + uses_custom_row_bounds, + _stream, + _mr); + _stream.synchronize(); + } +} + +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a49dbcc703c..26b3f97616f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -19,6 +19,7 @@ * @brief cuDF-IO parquet writer class implementation */ +#include "parquet_gpu.cuh" #include "writer_impl.hpp" #include "compact_protocol_reader.hpp" @@ -926,7 +927,7 @@ auto to_nvcomp_compression_type(Compression codec) auto page_alignment(Compression codec) { if (codec == Compression::UNCOMPRESSED or - not nvcomp::is_compression_enabled(to_nvcomp_compression_type(codec))) { + nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { return 1u; } @@ -1171,19 +1172,22 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks gpu::EncodePages(batch_pages, comp_in, comp_out, comp_res, stream); switch (compression_) { case parquet::Compression::SNAPPY: - if (nvcomp::is_compression_enabled(nvcomp::compression_type::SNAPPY)) { + if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { + gpu_snap(comp_in, comp_out, comp_res, stream); + } else { nvcomp::batched_compress( nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); - } else { - gpu_snap(comp_in, comp_out, comp_res, stream); } 
break; - case parquet::Compression::ZSTD: - if (nvcomp::is_compression_enabled(nvcomp::compression_type::ZSTD)) { - nvcomp::batched_compress( - nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); + case parquet::Compression::ZSTD: { + if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); + reason) { + CUDF_FAIL("Compression error: " + reason.value()); } + nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); + break; + } case parquet::Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } @@ -1245,9 +1249,9 @@ size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_enabled(ncomp_type) - ? nvcomp::compress_max_allowed_chunk_size(ncomp_type) - : std::nullopt; + auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) + ? 
std::nullopt + : nvcomp::compress_max_allowed_chunk_size(ncomp_type); return std::min(nvcomp_limit.value_or(max_page_size_bytes), max_page_size_bytes); } diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 22955deeabb..3fa68cd8b0f 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -144,7 +144,13 @@ class bgzip_data_chunk_reader : public data_chunk_reader { bgzip_nvcomp_transform_functor{reinterpret_cast(d_compressed_blocks.data()), reinterpret_cast(d_decompressed_blocks.begin())}); if (decompressed_size() > 0) { - if (cudf::io::detail::nvcomp_integration::is_all_enabled()) { + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) { + gpuinflate(d_compressed_spans, + d_decompressed_spans, + d_decompression_results, + gzip_header_included::NO, + stream); + } else { cudf::io::nvcomp::batched_decompress(cudf::io::nvcomp::compression_type::DEFLATE, d_compressed_spans, d_decompressed_spans, @@ -152,12 +158,6 @@ class bgzip_data_chunk_reader : public data_chunk_reader { max_decompressed_size, decompressed_size(), stream); - } else { - gpuinflate(d_compressed_spans, - d_decompressed_spans, - d_decompression_results, - gzip_header_included::NO, - stream); } } is_decompressed = true; diff --git a/cpp/src/io/utilities/column_type_histogram.hpp b/cpp/src/io/utilities/column_type_histogram.hpp index 8bd2d3a89cf..88f4e58f9b1 100644 --- a/cpp/src/io/utilities/column_type_histogram.hpp +++ b/cpp/src/io/utilities/column_type_histogram.hpp @@ -33,6 +33,11 @@ struct column_type_histogram { cudf::size_type positive_small_int_count{}; cudf::size_type big_int_count{}; cudf::size_type bool_count{}; + auto total_count() const + { + return null_count + float_count + datetime_count + string_count + negative_small_int_count + + positive_small_int_count + big_int_count + bool_count; + } }; } // namespace io diff --git a/cpp/src/io/utilities/file_io_utilities.cpp 
b/cpp/src/io/utilities/file_io_utilities.cpp index c0dd85702e2..2484a36143a 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -257,11 +257,20 @@ std::future cufile_output_impl::write_async(void const* data, size_t offse // writes. return std::async(std::launch::deferred, waiter, std::move(slice_tasks)); } +#else +cufile_input_impl::cufile_input_impl(std::string const& filepath) +{ + CUDF_FAIL("Cannot create cuFile source, current build was compiled without cuFile headers"); +} + +cufile_output_impl::cufile_output_impl(std::string const& filepath) +{ + CUDF_FAIL("Cannot create cuFile sink, current build was compiled without cuFile headers"); +} #endif std::unique_ptr make_cufile_input(std::string const& filepath) { -#ifdef CUFILE_FOUND if (cufile_integration::is_gds_enabled()) { try { return std::make_unique(filepath); @@ -269,13 +278,11 @@ std::unique_ptr make_cufile_input(std::string const& filepath if (cufile_integration::is_always_enabled()) throw; } } -#endif return nullptr; } std::unique_ptr make_cufile_output(std::string const& filepath) { -#ifdef CUFILE_FOUND if (cufile_integration::is_gds_enabled()) { try { return std::make_unique(filepath); @@ -283,7 +290,6 @@ std::unique_ptr make_cufile_output(std::string const& filepa if (cufile_integration::is_always_enabled()) throw; } } -#endif return nullptr; } diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 704ee77de8a..38674892966 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -194,6 +194,7 @@ class cufile_output_impl final : public cufile_output { class cufile_input_impl final : public cufile_input { public: + cufile_input_impl(std::string const& filepath); std::future read_async(size_t offset, size_t size, uint8_t* dst, @@ -205,6 +206,7 @@ class cufile_input_impl final : public cufile_input { class cufile_output_impl final : public 
cufile_output { public: + cufile_output_impl(std::string const& filepath); std::future write_async(void const* data, size_t offset, size_t size) override { CUDF_FAIL("Only used to compile without cufile library, should not be called"); diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index cbb3aec76c5..ca7ca2f6590 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -185,7 +185,7 @@ struct list_child_constructor { mr); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(child_column->size()), child_column->mutable_view().begin(), @@ -237,7 +237,7 @@ struct list_child_constructor { auto const null_string_view = string_view{nullptr, 0}; // placeholder for factory function thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(string_views.size()), string_views.begin(), @@ -304,7 +304,7 @@ struct list_child_constructor { // For instance, if a parent list_device_view has 3 elements, it should have 3 corresponding // child list_device_view instances. thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(child_list_views.size()), child_list_views.begin(), diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 019809d5f68..0c90b0af8d2 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -14,13 +14,14 @@ * limitations under the License. 
*/ +#include + #include #include #include #include #include #include -#include #include #include @@ -42,8 +43,8 @@ using namespace cudf::tdigest; namespace cudf { -namespace detail { namespace tdigest { +namespace detail { // https://developer.nvidia.com/blog/lerp-faster-cuda/ template @@ -338,7 +339,7 @@ std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, std::move(*std::make_unique
(std::move(contents.children))), true, stream, mr); } -} // namespace tdigest +} // namespace detail std::unique_ptr percentile_approx(tdigest_column_view const& input, column_view const& percentiles, @@ -354,8 +355,8 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, data_type{type_id::INT32}, input.size() + 1, mask_state::UNALLOCATED, stream, mr); auto const all_empty_rows = thrust::count_if(rmm::exec_policy(stream), - input.size_begin(), - input.size_begin() + input.size(), + detail::size_begin(input), + detail::size_begin(input) + input.size(), [] __device__(auto const x) { return x == 0; }) == input.size(); auto row_size_iter = thrust::make_constant_iterator(all_empty_rows ? 0 : percentiles.size()); thrust::exclusive_scan(rmm::exec_policy(stream), @@ -379,7 +380,7 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, // uninitialized) auto [bitmask, null_count] = [stream, mr, &tdv]() { auto tdigest_is_empty = thrust::make_transform_iterator( - tdv.size_begin(), + detail::size_begin(tdv), [] __device__(size_type tdigest_size) -> size_type { return tdigest_size == 0; }); auto const null_count = thrust::reduce(rmm::exec_policy(stream), tdigest_is_empty, tdigest_is_empty + tdv.size(), 0); @@ -390,24 +391,23 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, tdigest_is_empty, tdigest_is_empty + tdv.size(), thrust::logical_not{}, stream, mr); }(); - return cudf::make_lists_column( - input.size(), - std::move(offsets), - tdigest::compute_approx_percentiles(input, percentiles, stream, mr), - null_count, - std::move(bitmask), - stream, - mr); + return cudf::make_lists_column(input.size(), + std::move(offsets), + detail::compute_approx_percentiles(input, percentiles, stream, mr), + null_count, + std::move(bitmask), + stream, + mr); } -} // namespace detail +} // namespace tdigest std::unique_ptr percentile_approx(tdigest_column_view const& input, column_view const& percentiles, rmm::mr::device_memory_resource* mr) 
{ CUDF_FUNC_RANGE(); - return detail::percentile_approx(input, percentiles, cudf::get_default_stream(), mr); + return tdigest::percentile_approx(input, percentiles, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index d870b73dff4..38c6cf7bd2e 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -26,7 +28,6 @@ #include #include #include -#include #include #include @@ -52,10 +53,8 @@ #include namespace cudf { -namespace detail { namespace tdigest { - -using namespace cudf::tdigest; +namespace detail { namespace { @@ -596,7 +595,7 @@ std::unique_ptr build_output_column(size_type num_rows, // if there are no stub tdigests, we can return immediately. if (num_stubs == 0) { - return cudf::detail::tdigest::make_tdigest_column(num_rows, + return cudf::tdigest::detail::make_tdigest_column(num_rows, std::move(means), std::move(weights), std::move(offsets), @@ -642,7 +641,7 @@ std::unique_ptr build_output_column(size_type num_rows, 0); // assemble final column - return cudf::detail::tdigest::make_tdigest_column(num_rows, + return cudf::tdigest::detail::make_tdigest_column(num_rows, std::move(_means), std::move(_weights), std::move(offsets), @@ -708,7 +707,7 @@ std::unique_ptr compute_tdigests(int delta, // double // max // } // - if (total_clusters == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); } + if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } // each input group represents an individual tdigest. 
within each tdigest, we want the keys // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall @@ -1067,9 +1066,10 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, // generate min and max values auto merged_min_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); - auto min_iter = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(tdv.min_begin(), tdv.size_begin())), - tdigest_min{}); + auto min_iter = + thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple( + tdv.min_begin(), cudf::tdigest::detail::size_begin(tdv))), + tdigest_min{}); thrust::reduce_by_key(rmm::exec_policy(stream), group_labels, group_labels + num_group_labels, @@ -1081,9 +1081,10 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto merged_max_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); - auto max_iter = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(tdv.max_begin(), tdv.size_begin())), - tdigest_max{}); + auto max_iter = + thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple( + tdv.max_begin(), cudf::tdigest::detail::size_begin(tdv))), + tdigest_max{}); thrust::reduce_by_key(rmm::exec_policy(stream), group_labels, group_labels + num_group_labels, @@ -1190,7 +1191,7 @@ std::unique_ptr reduce_tdigest(column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (col.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_scalar(stream, mr); } + if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); } // since this isn't coming out of a groupby, we need to sort the inputs in ascending // order with nulls at the end. 
@@ -1209,7 +1210,7 @@ std::unique_ptr reduce_merge_tdigest(column_view const& input, { tdigest_column_view tdv(input); - if (input.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_scalar(stream, mr); } + if (input.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); } auto h_group_offsets = cudf::detail::make_counting_transform_iterator( 0, [size = input.size()](size_type i) { return i == 0 ? 0 : size; }); @@ -1238,7 +1239,7 @@ std::unique_ptr group_tdigest(column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (col.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); } + if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } auto const delta = max_centroids; return cudf::type_dispatcher(col.type(), @@ -1264,7 +1265,7 @@ std::unique_ptr group_merge_tdigest(column_view const& input, tdigest_column_view tdv(input); if (num_groups == 0 || input.size() == 0) { - return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); + return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } // bring group offsets back to the host @@ -1286,6 +1287,6 @@ std::unique_ptr group_merge_tdigest(column_view const& input, mr); } -} // namespace tdigest } // namespace detail +} // namespace tdigest } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp index a86b40fd64a..cfcd21c5690 100644 --- a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp +++ b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,13 +17,11 @@ #include #include #include -#include +#include namespace cudf { namespace tdigest { -using namespace cudf; - tdigest_column_view::tdigest_column_view(column_view const& col) : column_view(col) { // sanity check that this is actually tdigest data diff --git a/cpp/src/quantiles/tdigest/tdigest_util.cuh b/cpp/src/quantiles/tdigest/tdigest_util.cuh new file mode 100644 index 00000000000..d0e6484875b --- /dev/null +++ b/cpp/src/quantiles/tdigest/tdigest_util.cuh @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +namespace cudf { +namespace tdigest { +namespace detail { + +/** + * @brief Functor to compute the size of each tdigest of a column + */ +struct tdigest_size_fn { + size_type const* offsets; ///< Offsets of the t-digest column + /** + * @brief Returns size of the each tdigest in the column + * + * @param tdigest_index Index of the tdigest in the column + * @return Size of the tdigest + */ + __device__ size_type operator()(size_type tdigest_index) + { + return offsets[tdigest_index + 1] - offsets[tdigest_index]; + } +}; + +/** + * @brief Returns an iterator that returns the size of each tdigest + * in the column (each row is 1 digest) + * + * @return An iterator that returns the size of each tdigest in the column + */ +inline auto size_begin(tdigest_column_view const& tdv) +{ + return cudf::detail::make_counting_transform_iterator( + 0, tdigest_size_fn{tdv.centroids().offsets_begin()}); +} + +} // namespace detail +} // namespace tdigest +} // namespace cudf diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index d7a195c088c..38db7eb3e89 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -124,13 +124,13 @@ struct reduce_dispatch_functor { CUDF_EXPECTS(output_dtype.id() == type_id::STRUCT, "Tdigest aggregations expect output type to be STRUCT"); auto td_agg = static_cast(agg); - return detail::tdigest::reduce_tdigest(col, td_agg.max_centroids, stream, mr); + return tdigest::detail::reduce_tdigest(col, td_agg.max_centroids, stream, mr); } case aggregation::MERGE_TDIGEST: { CUDF_EXPECTS(output_dtype.id() == type_id::STRUCT, "Tdigest aggregations expect output type to be STRUCT"); auto td_agg = static_cast(agg); - return detail::tdigest::reduce_merge_tdigest(col, td_agg.max_centroids, stream, mr); + return tdigest::detail::reduce_merge_tdigest(col, td_agg.max_centroids, stream, mr); } default: CUDF_FAIL("Unsupported reduction operator"); } @@ -157,7 +157,7 
@@ std::unique_ptr reduce( // handcraft the default scalar with input column. if (col.size() <= col.null_count()) { if (agg.kind == aggregation::TDIGEST || agg.kind == aggregation::MERGE_TDIGEST) { - return detail::tdigest::make_empty_tdigest_scalar(stream); + return tdigest::detail::make_empty_tdigest_scalar(stream); } if (col.type().id() == type_id::EMPTY || col.type() != output_dtype) { // Under some circumstance, the output type will become the List of input type, diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 4394557e453..68480dbf773 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -122,10 +122,8 @@ struct DeviceRolling { using AggOp = typename corresponding_operator::type; AggOp agg_op; - // declare this as volatile to avoid some compiler optimizations that lead to incorrect results - // for CUDA 10.0 and below (fixed in CUDA 10.1) - volatile cudf::size_type count = 0; - OutputType val = AggOp::template identity(); + cudf::size_type count = 0; + OutputType val = AggOp::template identity(); for (size_type j = start_index; j < end_index; j++) { if (!has_nulls || input.is_valid(j)) { @@ -190,11 +188,9 @@ struct DeviceRollingArgMinMaxString : DeviceRollingArgMinMaxBase::type; AggOp agg_op; - // declare this as volatile to avoid some compiler optimizations that lead to incorrect results - // for CUDA 10.0 and below (fixed in CUDA 10.1) - volatile cudf::size_type count = 0; - InputType val = AggOp::template identity(); - OutputType val_index = default_output; + cudf::size_type count = 0; + InputType val = AggOp::template identity(); + OutputType val_index = default_output; for (size_type j = start_index; j < end_index; j++) { if (!has_nulls || input.is_valid(j)) { @@ -284,13 +280,11 @@ struct DeviceRollingCountValid { size_type end_index, size_type current_index) { - // declare this as volatile to avoid some compiler optimizations that lead to incorrect - // results for CUDA 10.0 
and below (fixed in CUDA 10.1) - volatile cudf::size_type count = 0; - bool output_is_valid = ((end_index - start_index) >= min_periods); if (output_is_valid) { + cudf::size_type count = 0; + if (!has_nulls) { count = end_index - start_index; } else { diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu index ecdbbb6a0f2..3bfee32d1cc 100644 --- a/cpp/src/rolling/jit/kernel.cu +++ b/cpp/src/rolling/jit/kernel.cu @@ -58,10 +58,6 @@ __global__ void gpu_rolling_new(cudf::size_type nrows, auto active_threads = __ballot_sync(0xffff'ffffu, i < nrows); while (i < nrows) { - // declare this as volatile to avoid some compiler optimizations that lead to incorrect results - // for CUDA 10.0 and below (fixed in CUDA 10.1) - volatile cudf::size_type count = 0; - int64_t const preceding_window = get_window(preceding_window_begin, i); int64_t const following_window = get_window(following_window_begin, i); @@ -77,8 +73,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows, // TODO: We should explore using shared memory to avoid redundant loads. // This might require separating the kernel into a special version // for dynamic and static sizes. 
- count = end_index - start_index; - OutType val = agg_op::template operate(in_col, start_index, count); + cudf::size_type count = end_index - start_index; + OutType val = agg_op::template operate(in_col, start_index, count); // check if we have enough input samples bool const output_is_valid = (count >= min_periods); diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index ea01b570b91..127d3aa8fe7 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -87,19 +87,17 @@ std::unique_ptr counts_fn(strings_column_view const& strings, } // namespace -std::unique_ptr count_characters( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr count_characters(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto ufn = [] __device__(const string_view& d_str) { return d_str.length(); }; return counts_fn(strings, ufn, stream, mr); } -std::unique_ptr count_bytes( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr count_bytes(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto ufn = [] __device__(const string_view& d_str) { return d_str.size_bytes(); }; return counts_fn(strings, ufn, stream, mr); @@ -135,10 +133,9 @@ struct code_points_fn { namespace detail { // -std::unique_ptr code_points( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr code_points(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; diff 
--git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 05c2904ec9e..a2cee757112 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -147,30 +147,27 @@ std::unique_ptr convert_case(strings_column_view const& strings, } // namespace -std::unique_ptr to_lower( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr to_lower(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { character_flags_table_type case_flag = IS_UPPER(0xFF); // convert only upper case characters return convert_case(strings, case_flag, stream, mr); } // -std::unique_ptr to_upper( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr to_upper(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { character_flags_table_type case_flag = IS_LOWER(0xFF); // convert only lower case characters return convert_case(strings, case_flag, stream, mr); } // -std::unique_ptr swapcase( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr swapcase(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // convert only upper or lower case characters character_flags_table_type case_flag = IS_LOWER(0xFF) | IS_UPPER(0xFF); diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 0426d82c6c6..aa1e4dce4d0 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -38,12 +38,11 @@ namespace cudf { namespace strings { namespace detail { // -std::unique_ptr all_characters_of_type( - strings_column_view const& 
strings, - string_character_types types, - string_character_types verify_types, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr all_characters_of_type(strings_column_view const& strings, + string_character_types types, + string_character_types verify_types, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); auto strings_column = column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index c6e71b00809..eafc78be8da 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -57,8 +58,7 @@ struct contains_fn { }; std::unique_ptr contains_impl(strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, + regex_program const& prog, bool const beginning_only, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -71,7 +71,7 @@ std::unique_ptr contains_impl(strings_column_view const& input, mr); if (input.is_empty()) { return results; } - auto d_prog = reprog_device::create(pattern, flags, capture_groups::NON_CAPTURE, stream); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); auto d_results = results->mutable_view().data(); auto const d_strings = column_device_view::create(input.parent(), stream); @@ -86,35 +86,29 @@ std::unique_ptr contains_impl(strings_column_view const& input, } // namespace -std::unique_ptr contains_re( - strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr contains_re(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return contains_impl(input, pattern, 
flags, false, stream, mr); + return contains_impl(input, prog, false, stream, mr); } -std::unique_ptr matches_re( - strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr matches_re(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return contains_impl(input, pattern, flags, true, stream, mr); + return contains_impl(input, prog, true, stream, mr); } -std::unique_ptr count_re( - strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr count_re(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - // compile regex into device object - auto d_prog = reprog_device::create(pattern, flags, capture_groups::NON_CAPTURE, stream); + // create device object from regex_program + auto d_prog = regex_device_builder::create_prog_device(prog, stream); auto const d_strings = column_device_view::create(input.parent(), stream); @@ -136,7 +130,16 @@ std::unique_ptr contains_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_re(strings, pattern, flags, cudf::get_default_stream(), mr); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::NON_CAPTURE); + return detail::contains_re(strings, *h_prog, cudf::get_default_stream(), mr); +} + +std::unique_ptr contains_re(strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::contains_re(strings, prog, cudf::get_default_stream(), mr); } std::unique_ptr 
matches_re(strings_column_view const& strings, @@ -145,7 +148,16 @@ std::unique_ptr matches_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::matches_re(strings, pattern, flags, cudf::get_default_stream(), mr); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::NON_CAPTURE); + return detail::matches_re(strings, *h_prog, cudf::get_default_stream(), mr); +} + +std::unique_ptr matches_re(strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::matches_re(strings, prog, cudf::get_default_stream(), mr); } std::unique_ptr count_re(strings_column_view const& strings, @@ -154,7 +166,16 @@ std::unique_ptr count_re(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_re(strings, pattern, flags, cudf::get_default_stream(), mr); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::NON_CAPTURE); + return detail::count_re(strings, *h_prog, cudf::get_default_stream(), mr); +} + +std::unique_ptr count_re(strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::count_re(strings, prog, cudf::get_default_stream(), mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 2de4bd2a2cc..49713731ff5 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -454,10 +454,9 @@ std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_m } namespace detail { -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr is_float(strings_column_view const& strings, + 
rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index dbbdffac2c2..f41232a4af6 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -206,11 +206,10 @@ struct dispatch_integers_to_hex_fn { } // namespace // This will convert a strings column into any integer column type. -std::unique_ptr hex_to_integers( - strings_column_view const& strings, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr hex_to_integers(strings_column_view const& strings, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(output_type); diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 343288af0c1..ed40c47b99d 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -157,10 +157,9 @@ struct dispatch_is_integer_fn { } // namespace -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr is_integer(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const d_column = column_device_view::create(strings.parent(), stream); auto results = make_numeric_column(data_type{type_id::BOOL8}, @@ -192,11 +191,10 @@ std::unique_ptr is_integer( return results; } -std::unique_ptr is_integer( - strings_column_view const& strings, - data_type int_type, - rmm::cuda_stream_view stream, - 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr is_integer(strings_column_view const& strings, + data_type int_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (strings.is_empty()) { return cudf::make_empty_column(type_id::BOOL8); } return type_dispatcher(int_type, dispatch_is_integer_fn{}, strings, stream, mr); diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index 5229f0fdf1b..0dcb2b61446 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -75,10 +75,9 @@ struct ipv4_to_integers_fn { } // namespace // Convert strings column of IPv4 addresses to integers column -std::unique_ptr ipv4_to_integers( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr ipv4_to_integers(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); if (strings_count == 0) return make_numeric_column(data_type{type_id::INT64}, 0); @@ -162,10 +161,9 @@ struct integers_to_ipv4_fn { } // namespace // Convert integers into IPv4 addresses -std::unique_ptr integers_to_ipv4( - column_view const& integers, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr integers_to_ipv4(column_view const& integers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = integers.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 0c6ecf46313..25e37526f59 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -129,10 +129,9 @@ struct url_encoder_fn { } 
// namespace // -std::unique_ptr url_encode( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr url_encode(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); @@ -388,10 +387,9 @@ __global__ void url_decode_char_replacer(column_device_view const in_strings, } // namespace // -std::unique_ptr url_decode( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr url_decode(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 627e689d4d9..e44c343e31b 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -127,10 +127,8 @@ __global__ void fused_concatenate_string_offset_kernel(column_device_view const* if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); } while (output_index < output_size) { // Lookup input index by searching for output index in offsets - // thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead - auto const offset_it = - -1 + thrust::upper_bound( - thrust::seq, input_offsets, input_offsets + num_input_views, output_index); + auto const offset_it = thrust::prev(thrust::upper_bound( + thrust::seq, input_offsets, input_offsets + num_input_views, output_index)); size_type const partition_index = offset_it - input_offsets; auto const offset_index = output_index - *offset_it; @@ -180,10 +178,8 @@ __global__ void 
fused_concatenate_string_chars_kernel(column_device_view const* while (output_index < output_size) { // Lookup input index by searching for output index in offsets - // thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead - auto const offset_it = - -1 + thrust::upper_bound( - thrust::seq, partition_offsets, partition_offsets + num_input_views, output_index); + auto const offset_it = thrust::prev(thrust::upper_bound( + thrust::seq, partition_offsets, partition_offsets + num_input_views, output_index)); size_type const partition_index = offset_it - partition_offsets; auto const offset_index = output_index - *offset_it; diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp index d4bcdaf4042..a4f76c1c5e3 100644 --- a/cpp/src/strings/count_matches.hpp +++ b/cpp/src/strings/count_matches.hpp @@ -41,12 +41,11 @@ class reprog_device; * @param mr Device memory resource used to allocate the returned column's device memory. * @return Integer column of match counts */ -std::unique_ptr count_matches( - column_device_view const& d_strings, - reprog_device& d_prog, - size_type output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr count_matches(column_device_view const& d_strings, + reprog_device& d_prog, + size_type output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index 882b85d1066..f99b0e63715 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -86,13 +87,12 @@ struct extract_fn { // std::unique_ptr
extract(strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, + regex_program const& prog, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // compile regex into device object - auto d_prog = reprog_device::create(pattern, flags, capture_groups::EXTRACT, stream); + // create device object from regex_program + auto d_prog = regex_device_builder::create_prog_device(prog, stream); auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); @@ -136,7 +136,16 @@ std::unique_ptr
extract(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract(strings, pattern, flags, cudf::get_default_stream(), mr); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::EXTRACT); + return detail::extract(strings, *h_prog, cudf::get_default_stream(), mr); +} + +std::unique_ptr
extract(strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract(strings, prog, cudf::get_default_stream(), mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 1ba5a8a1470..c27834dae19 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -95,18 +96,17 @@ struct extract_fn { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_all_record( - strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr extract_all_record(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const strings_count = input.size(); auto const d_strings = column_device_view::create(input.parent(), stream); - // Compile regex into device object. - auto d_prog = reprog_device::create(pattern, flags, capture_groups::EXTRACT, stream); + // create device object from regex_program + auto d_prog = regex_device_builder::create_prog_device(prog, stream); + // The extract pattern should always include groups. 
auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "extract_all requires group indicators in the regex pattern."); @@ -171,7 +171,16 @@ std::unique_ptr extract_all_record(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_all_record(strings, pattern, flags, cudf::get_default_stream(), mr); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::EXTRACT); + return detail::extract_all_record(strings, *h_prog, cudf::get_default_stream(), mr); +} + +std::unique_ptr extract_all_record(strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_all_record(strings, prog, cudf::get_default_stream(), mr); } } // namespace strings diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index f813ec24ee9..4bd98ee4cdc 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -35,13 +35,12 @@ namespace cudf { namespace strings { namespace detail { -std::unique_ptr fill( - strings_column_view const& strings, - size_type begin, - size_type end, - string_scalar const& value, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr fill(strings_column_view const& strings, + size_type begin, + size_type end, + string_scalar const& value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/like.cu b/cpp/src/strings/like.cu index cb6fc844426..4e4df6cb1ad 100644 --- a/cpp/src/strings/like.cu +++ b/cpp/src/strings/like.cu @@ -102,12 +102,11 @@ struct like_fn { } // namespace -std::unique_ptr like( - strings_column_view const& 
input, - string_scalar const& pattern, - string_scalar const& escape_character, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr like(strings_column_view const& input, + string_scalar const& pattern, + string_scalar const& escape_character, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto results = make_numeric_column(data_type{type_id::BOOL8}, input.size(), diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index d84b4afc7cf..e5497849681 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -53,13 +53,12 @@ struct compute_pad_output_length_fn { } // namespace -std::unique_ptr pad( - strings_column_view const& strings, - size_type width, - side_type side = side_type::RIGHT, - std::string_view fill_char = " ", - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr pad(strings_column_view const& strings, + size_type width, + side_type side, + std::string_view fill_char, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); @@ -128,11 +127,10 @@ std::unique_ptr pad( std::move(null_mask)); } -std::unique_ptr zfill( - strings_column_view const& input, - size_type width, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr zfill(strings_column_view const& input, + size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 5b86aedc409..0c0404f31ce 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -123,7 
+123,7 @@ int32_t reprog::add_class(reclass const& cls) reinst& reprog::inst_at(int32_t id) { return _insts[id]; } -reclass& reprog::class_at(int32_t id) { return _classes[id]; } +reclass const& reprog::class_at(int32_t id) const { return _classes[id]; } void reprog::set_start_inst(int32_t id) { _startinst_id = id; } diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 7ad7f481436..b450b3f90e7 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -128,7 +128,7 @@ class reprog { [[nodiscard]] reinst const* insts_data() const; [[nodiscard]] int32_t classes_count() const; - [[nodiscard]] reclass& class_at(int32_t id); + [[nodiscard]] reclass const& class_at(int32_t id) const; [[nodiscard]] reclass const* classes_data() const; [[nodiscard]] const int32_t* starts_data() const; diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index 98631680800..d16efb5f66e 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -25,6 +25,8 @@ #include #include +#include + #include namespace cudf { @@ -56,6 +58,8 @@ struct alignas(16) reclass_device { __device__ inline bool is_match(char32_t const ch, uint8_t const* flags) const; }; +class reprog; + /** * @brief Regex program of instructions/data for a specific regex pattern. * @@ -78,32 +82,14 @@ class reprog_device { reprog_device& operator=(reprog_device&&) = default; /** - * @brief Create device program instance from a regex pattern. - * - * The number of strings is needed to compute the state data size required when evaluating the - * regex. - * - * @param pattern The regex pattern to compile. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @return The program device object. 
- */ - static std::unique_ptr> create( - std::string_view pattern, rmm::cuda_stream_view stream); - - /** - * @brief Create the device program instance from a regex pattern + * @brief Create device program instance from a regex program * - * @param pattern The regex pattern to compile - * @param re_flags Regex flags for interpreting special characters in the pattern - * @param capture Control how capture groups are processed + * @param prog The regex program to create from * @param stream CUDA stream used for device memory operations and kernel launches * @return The program device object */ static std::unique_ptr> create( - std::string_view pattern, - regex_flags const re_flags, - capture_groups const capture, - rmm::cuda_stream_view stream); + reprog const& prog, rmm::cuda_stream_view stream); /** * @brief Called automatically by the unique_ptr returned from create(). @@ -270,7 +256,7 @@ class reprog_device { cudf::size_type& end, cudf::size_type const group_id = 0) const; - reprog_device(reprog&); + reprog_device(reprog const&); int32_t _startinst_id; // first instruction id int32_t _num_capturing_groups; // instruction groups @@ -289,6 +275,16 @@ class reprog_device { int32_t _thread_count{}; // threads available in working memory }; +/** + * @brief Return the size in bytes needed for working memory to + * execute insts_count instructions in parallel over num_threads threads. 
+ * + * @param num_threads Number of parallel threads (usually one per string in a strings column) + * @param insts_count Number of instructions from a compiled regex pattern + * @return Number of bytes needed for working memory + */ +std::size_t compute_working_memory_size(int32_t num_threads, int32_t insts_count); + } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/regex/regex_program.cpp b/cpp/src/strings/regex/regex_program.cpp new file mode 100644 index 00000000000..c64da213fcf --- /dev/null +++ b/cpp/src/strings/regex/regex_program.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "regex_program_impl.h" + +#include + +#include +#include + +namespace cudf { +namespace strings { + +std::unique_ptr regex_program::create(std::string_view pattern, + regex_flags flags, + capture_groups capture) +{ + auto p = new regex_program(pattern, flags, capture); + return std::unique_ptr(p); +} + +regex_program::~regex_program() = default; +regex_program::regex_program(regex_program&& other) = default; +regex_program& regex_program::operator=(regex_program&& other) = default; + +regex_program::regex_program(std::string_view pattern, regex_flags flags, capture_groups capture) + : _pattern(pattern), + _flags(flags), + _impl( + std::make_unique(detail::reprog::create_from(pattern, flags, capture))) +{ +} + +std::string regex_program::pattern() const { return _pattern; } + +regex_flags regex_program::flags() const { return _flags; } + +capture_groups regex_program::capture() const { return _capture; } + +int32_t regex_program::instructions_count() const { return _impl->prog.insts_count(); } + +int32_t regex_program::groups_count() const { return _impl->prog.groups_count(); } + +std::size_t regex_program::compute_working_memory_size(int32_t num_strings) const +{ + return detail::compute_working_memory_size(num_strings, instructions_count()); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/regex/regex_program_impl.h b/cpp/src/strings/regex/regex_program_impl.h new file mode 100644 index 00000000000..eede2225bce --- /dev/null +++ b/cpp/src/strings/regex/regex_program_impl.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "regcomp.h" +#include "regex.cuh" + +#include + +#include + +namespace cudf { +namespace strings { + +/** + * @brief Implementation object for regex_program + * + * It encapsulates internal reprog object used for building its device equivalent + */ +struct regex_program::regex_program_impl { + detail::reprog prog; + + regex_program_impl(detail::reprog const& p) : prog(p) {} + regex_program_impl(detail::reprog&& p) : prog(p) {} + + // TODO: There will be other options added here in the future to handle issues + // 10852 and possibly others like 11979 +}; + +struct regex_device_builder { + static auto create_prog_device(regex_program const& p, rmm::cuda_stream_view stream) + { + return detail::reprog_device::create(p._impl->prog, stream); + } +}; + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cpp similarity index 90% rename from cpp/src/strings/regex/regexec.cu rename to cpp/src/strings/regex/regexec.cpp index 03247d24ba3..febad651f69 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cpp @@ -33,7 +33,7 @@ namespace strings { namespace detail { // Copy reprog primitive values -reprog_device::reprog_device(reprog& prog) +reprog_device::reprog_device(reprog const& prog) : _startinst_id{prog.get_start_inst()}, _num_capturing_groups{prog.groups_count()}, _insts_count{prog.insts_count()}, @@ -45,22 +45,8 @@ reprog_device::reprog_device(reprog& prog) } std::unique_ptr> reprog_device::create( - std::string_view pattern, 
rmm::cuda_stream_view stream) + reprog const& h_prog, rmm::cuda_stream_view stream) { - return reprog_device::create( - pattern, regex_flags::MULTILINE, capture_groups::NON_CAPTURE, stream); -} - -// Create instance of the reprog that can be passed into a device kernel -std::unique_ptr> reprog_device::create( - std::string_view pattern, - regex_flags const flags, - capture_groups const capture, - rmm::cuda_stream_view stream) -{ - // compile pattern into host object - reprog h_prog = reprog::create_from(pattern, flags, capture); - // compute size to hold all the member data auto const insts_count = h_prog.insts_count(); auto const classes_count = h_prog.classes_count(); @@ -144,7 +130,7 @@ void reprog_device::destroy() { delete this; } std::size_t reprog_device::working_memory_size(int32_t num_threads) const { - return relist::alloc_size(_insts_count, num_threads) * 2; + return compute_working_memory_size(num_threads, insts_counts()); } std::pair reprog_device::compute_strided_working_memory( @@ -176,6 +162,11 @@ int32_t reprog_device::compute_shared_memory_size() const return _prog_size < MAX_SHARED_MEM ? 
static_cast(_prog_size) : 0; } +std::size_t compute_working_memory_size(int32_t num_threads, int32_t insts_count) +{ + return relist::alloc_size(insts_count, num_threads) * 2; +} + } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 9658610da18..383337c9088 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -16,6 +16,7 @@ #include "backref_re.cuh" +#include #include #include @@ -102,19 +103,18 @@ std::pair> parse_backrefs(std::string_vie // std::unique_ptr replace_with_backrefs(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, std::string_view replacement, - regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return make_empty_column(type_id::STRING); - CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + CUDF_EXPECTS(!prog.pattern().empty(), "Parameter pattern must not be empty"); CUDF_EXPECTS(!replacement.empty(), "Parameter replacement must not be empty"); - // compile regex into device object - auto d_prog = reprog_device::create(pattern, flags, capture_groups::EXTRACT, stream); + // create device object from regex_program + auto d_prog = regex_device_builder::create_prog_device(prog, stream); // parse the repl string for back-ref indicators auto group_count = std::min(99, d_prog->group_counts()); // group count should NOT exceed 99 @@ -152,8 +152,18 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& strings rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::EXTRACT); return detail::replace_with_backrefs( - strings, pattern, replacement, flags, cudf::get_default_stream(), mr); + strings, *h_prog, replacement, cudf::get_default_stream(), mr); +} + +std::unique_ptr 
replace_with_backrefs(strings_column_view const& strings, + regex_program const& prog, + std::string_view replacement, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::replace_with_backrefs(strings, prog, replacement, cudf::get_default_stream(), mr); } } // namespace strings diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index cc5cf1384ec..fcc24f36b5c 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -125,13 +126,12 @@ struct replace_multi_regex_fn { } // namespace -std::unique_ptr replace_re( - strings_column_view const& input, - std::vector const& patterns, - strings_column_view const& replacements, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr replace_re(strings_column_view const& input, + std::vector const& patterns, + strings_column_view const& replacements, + regex_flags const flags, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) { return make_empty_column(type_id::STRING); } if (patterns.empty()) { // if no patterns; just return a copy @@ -145,7 +145,8 @@ std::unique_ptr replace_re( patterns.size()); std::transform( patterns.begin(), patterns.end(), h_progs.begin(), [flags, stream](auto const& ptn) { - return reprog_device::create(ptn, flags, capture_groups::NON_CAPTURE, stream); + auto h_prog = regex_program::create(ptn, flags, capture_groups::NON_CAPTURE); + return regex_device_builder::create_prog_device(*h_prog, stream); }); // get the longest regex for the dispatcher diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 04cb074c016..0e2f3169e8e 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -14,6 +14,7 @@ * limitations under the 
License. */ +#include #include #include @@ -100,22 +101,20 @@ struct replace_regex_fn { } // namespace // -std::unique_ptr replace_re( - strings_column_view const& input, - std::string_view pattern, - string_scalar const& replacement, - std::optional max_replace_count, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr replace_re(strings_column_view const& input, + regex_program const& prog, + string_scalar const& replacement, + std::optional max_replace_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return make_empty_column(type_id::STRING); CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid"); string_view d_repl(replacement.data(), replacement.size()); - // compile regex into device object - auto d_prog = reprog_device::create(pattern, flags, capture_groups::NON_CAPTURE, stream); + // create device object from regex_program + auto d_prog = regex_device_builder::create_prog_device(prog, stream); auto const maxrepl = max_replace_count.value_or(-1); @@ -141,10 +140,22 @@ std::unique_ptr replace_re(strings_column_view const& strings, std::optional max_replace_count, regex_flags const flags, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::NON_CAPTURE); + return detail::replace_re( + strings, *h_prog, replacement, max_replace_count, cudf::get_default_stream(), mr); +} + +std::unique_ptr replace_re(strings_column_view const& strings, + regex_program const& prog, + string_scalar const& replacement, + std::optional max_replace_count, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::replace_re( - strings, pattern, replacement, max_replace_count, flags, cudf::get_default_stream(), mr); + strings, prog, replacement, max_replace_count, cudf::get_default_stream(), mr); } } // 
namespace strings diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index c48aedc5499..e6384d5d6e1 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -102,13 +102,12 @@ std::unique_ptr find_fn(strings_column_view const& strings, } // namespace -std::unique_ptr find( - strings_column_view const& strings, - string_scalar const& target, - size_type start = 0, - size_type stop = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr find(strings_column_view const& strings, + string_scalar const& target, + size_type start, + size_type stop, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] __device__( string_view d_string, string_view d_target, size_type start, size_type stop) { @@ -122,13 +121,12 @@ std::unique_ptr find( return find_fn(strings, target, start, stop, pfn, stream, mr); } -std::unique_ptr rfind( - strings_column_view const& strings, - string_scalar const& target, - size_type start = 0, - size_type stop = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr rfind(strings_column_view const& strings, + string_scalar const& target, + size_type start, + size_type stop, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] __device__( string_view d_string, string_view d_target, size_type start, size_type stop) { @@ -366,11 +364,10 @@ std::unique_ptr contains_fn(strings_column_view const& strings, } // namespace -std::unique_ptr contains( - strings_column_view const& input, - string_scalar const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr contains(strings_column_view const& input, + string_scalar const& target, + 
rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // use warp parallel when the average string width is greater than the threshold if (!input.is_empty() && ((input.chars_size() / input.size()) > AVG_CHAR_BYTES_THRESHOLD)) { @@ -384,11 +381,10 @@ std::unique_ptr contains( return contains_fn(input, target, pfn, stream, mr); } -std::unique_ptr contains( - strings_column_view const& strings, - strings_column_view const& targets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr contains(strings_column_view const& strings, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] __device__(string_view d_string, string_view d_target) { return d_string.find(d_target) != string_view::npos; @@ -396,11 +392,10 @@ std::unique_ptr contains( return contains_fn(strings, targets, pfn, stream, mr); } -std::unique_ptr starts_with( - strings_column_view const& strings, - string_scalar const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr starts_with(strings_column_view const& strings, + string_scalar const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] __device__(string_view d_string, string_view d_target) { return (d_target.size_bytes() <= d_string.size_bytes()) && @@ -409,11 +404,10 @@ std::unique_ptr starts_with( return contains_fn(strings, target, pfn, stream, mr); } -std::unique_ptr starts_with( - strings_column_view const& strings, - strings_column_view const& targets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr starts_with(strings_column_view const& strings, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] 
__device__(string_view d_string, string_view d_target) { return (d_target.size_bytes() <= d_string.size_bytes()) && @@ -422,11 +416,10 @@ std::unique_ptr starts_with( return contains_fn(strings, targets, pfn, stream, mr); } -std::unique_ptr ends_with( - strings_column_view const& strings, - string_scalar const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr ends_with(strings_column_view const& strings, + string_scalar const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] __device__(string_view d_string, string_view d_target) { auto const str_size = d_string.size_bytes(); @@ -438,11 +431,10 @@ std::unique_ptr ends_with( return contains_fn(strings, target, pfn, stream, mr); } -std::unique_ptr ends_with( - strings_column_view const& strings, - strings_column_view const& targets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr ends_with(strings_column_view const& strings, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] __device__(string_view d_string, string_view d_target) { auto const str_size = d_string.size_bytes(); diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index 389e6eccc43..1907c0d749b 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -34,11 +34,10 @@ namespace cudf { namespace strings { namespace detail { -std::unique_ptr find_multiple( - strings_column_view const& input, - strings_column_view const& targets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr find_multiple(strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + 
rmm::mr::device_memory_resource* mr) { auto const strings_count = input.size(); auto const targets_count = targets.size(); diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 07829581aa6..6ab1b3e726b 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -92,18 +93,16 @@ std::unique_ptr findall_util(column_device_view const& d_strings, } // namespace // -std::unique_ptr findall( - strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr findall(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const strings_count = input.size(); auto const d_strings = column_device_view::create(input.parent(), stream); - // compile regex into device object - auto const d_prog = reprog_device::create(pattern, flags, capture_groups::NON_CAPTURE, stream); + // create device object from regex_program + auto d_prog = regex_device_builder::create_prog_device(prog, stream); // Create lists offsets column auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr); @@ -139,7 +138,16 @@ std::unique_ptr findall(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::findall(input, pattern, flags, cudf::get_default_stream(), mr); + auto const h_prog = regex_program::create(pattern, flags, capture_groups::NON_CAPTURE); + return detail::findall(input, *h_prog, cudf::get_default_stream(), mr); +} + +std::unique_ptr findall(strings_column_view const& input, + regex_program const& prog, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::findall(input, prog, cudf::get_default_stream(), mr); } } // namespace strings diff 
--git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu index acdd9efbb45..09aadb78554 100644 --- a/cpp/src/strings/split/partition.cu +++ b/cpp/src/strings/split/partition.cu @@ -181,11 +181,10 @@ struct rpartition_fn : public partition_fn { } // namespace -std::unique_ptr
partition( - strings_column_view const& strings, - string_scalar const& delimiter = string_scalar(""), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr
partition(strings_column_view const& strings, + string_scalar const& delimiter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); auto strings_count = strings.size(); @@ -209,11 +208,10 @@ std::unique_ptr
partition( return std::make_unique
(std::move(results)); } -std::unique_ptr
rpartition( - strings_column_view const& strings, - string_scalar const& delimiter = string_scalar(""), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr
rpartition(strings_column_view const& strings, + string_scalar const& delimiter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); auto strings_count = strings.size(); diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 89b4c1d75c2..c11d7ad47f9 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -791,12 +791,11 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, } // namespace -std::unique_ptr
split( - strings_column_view const& strings_column, - string_scalar const& delimiter = string_scalar(""), - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr
split(strings_column_view const& strings_column, + string_scalar const& delimiter, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); @@ -816,12 +815,11 @@ std::unique_ptr
split( strings_column, split_tokenizer_fn{*strings_device_view, d_delimiter, max_tokens}, stream, mr); } -std::unique_ptr
rsplit( - strings_column_view const& strings_column, - string_scalar const& delimiter = string_scalar(""), - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr
rsplit(strings_column_view const& strings_column, + string_scalar const& delimiter, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 2538bab6229..fdd46300820 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -184,13 +185,13 @@ struct tokens_transform_fn { }; std::unique_ptr
split_re(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, split_direction direction, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + CUDF_EXPECTS(!prog.pattern().empty(), "Parameter pattern must not be empty"); auto const strings_count = input.size(); @@ -200,12 +201,14 @@ std::unique_ptr
split_re(strings_column_view const& input, return std::make_unique
(std::move(results)); } - // create the regex device prog from the given pattern - auto d_prog = reprog_device::create(pattern, stream); + // create device object from regex_program + auto d_prog = regex_device_builder::create_prog_device(prog, stream); + auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string - auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream); + auto offsets = count_matches( + *d_strings, *d_prog, strings_count + 1, stream, rmm::mr::get_current_device_resource()); auto offsets_view = offsets->mutable_view(); auto d_offsets = offsets_view.data(); @@ -252,18 +255,19 @@ std::unique_ptr
split_re(strings_column_view const& input, } std::unique_ptr split_record_re(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, split_direction direction, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + CUDF_EXPECTS(!prog.pattern().empty(), "Parameter pattern must not be empty"); auto const strings_count = input.size(); - // create the regex device prog from the given pattern - auto d_prog = reprog_device::create(pattern, stream); + // create device object from regex_program + auto d_prog = regex_device_builder::create_prog_device(prog, stream); + auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string @@ -289,39 +293,39 @@ std::unique_ptr split_record_re(strings_column_view const& input, } // namespace std::unique_ptr
split_re(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return split_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); + return split_re(input, prog, split_direction::FORWARD, maxsplit, stream, mr); } std::unique_ptr split_record_re(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return split_record_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); + return split_record_re(input, prog, split_direction::FORWARD, maxsplit, stream, mr); } std::unique_ptr
rsplit_re(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return split_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); + return split_re(input, prog, split_direction::BACKWARD, maxsplit, stream, mr); } std::unique_ptr rsplit_record_re(strings_column_view const& input, - std::string_view pattern, + regex_program const& prog, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return split_record_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); + return split_record_re(input, prog, split_direction::BACKWARD, maxsplit, stream, mr); } } // namespace detail @@ -334,7 +338,17 @@ std::unique_ptr
split_re(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_re(input, pattern, maxsplit, cudf::get_default_stream(), mr); + auto const h_prog = regex_program::create(pattern); + return detail::split_re(input, *h_prog, maxsplit, cudf::get_default_stream(), mr); +} + +std::unique_ptr
split_re(strings_column_view const& input, + regex_program const& prog, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_re(input, prog, maxsplit, cudf::get_default_stream(), mr); } std::unique_ptr split_record_re(strings_column_view const& input, @@ -343,7 +357,17 @@ std::unique_ptr split_record_re(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record_re(input, pattern, maxsplit, cudf::get_default_stream(), mr); + auto const h_prog = regex_program::create(pattern); + return detail::split_record_re(input, *h_prog, maxsplit, cudf::get_default_stream(), mr); +} + +std::unique_ptr split_record_re(strings_column_view const& input, + regex_program const& prog, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr); } std::unique_ptr
rsplit_re(strings_column_view const& input, @@ -352,7 +376,17 @@ std::unique_ptr
rsplit_re(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit_re(input, pattern, maxsplit, cudf::get_default_stream(), mr); + auto const h_prog = regex_program::create(pattern); + return detail::rsplit_re(input, *h_prog, maxsplit, cudf::get_default_stream(), mr); +} + +std::unique_ptr
rsplit_re(strings_column_view const& input, + regex_program const& prog, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_re(input, prog, maxsplit, cudf::get_default_stream(), mr); } std::unique_ptr rsplit_record_re(strings_column_view const& input, @@ -361,7 +395,18 @@ std::unique_ptr rsplit_record_re(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit_record_re(input, pattern, maxsplit, cudf::get_default_stream(), mr); + auto const h_prog = regex_program::create(pattern); + return detail::rsplit_record_re(input, *h_prog, maxsplit, cudf::get_default_stream(), mr); } + +std::unique_ptr rsplit_record_re(strings_column_view const& input, + regex_program const& prog, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 83d8d7f9203..d935ad0b1da 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -264,12 +264,11 @@ std::unique_ptr split_record_fn(strings_column_view const& strings, } template -std::unique_ptr split_record( - strings_column_view const& strings, - string_scalar const& delimiter = string_scalar(""), - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr split_record(strings_column_view const& strings, + string_scalar const& delimiter, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); diff --git a/cpp/src/strings/strings_column_factories.cu 
b/cpp/src/strings/strings_column_factories.cu index ca30eb3f6d8..2159b67774e 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -56,13 +56,12 @@ std::unique_ptr make_strings_column( return cudf::strings::detail::make_strings_column(strings.begin(), strings.end(), stream, mr); } -std::unique_ptr make_strings_column( - device_span chars, - device_span offsets, - size_type null_count, - rmm::device_buffer&& null_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr make_strings_column(device_span chars, + device_span offsets, + size_type null_count, + rmm::device_buffer&& null_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index e982050b8d6..6fb7c671a87 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -56,12 +56,11 @@ struct strip_transform_fn { } // namespace -std::unique_ptr strip( - strings_column_view const& input, - side_type side = side_type::BOTH, - string_scalar const& to_strip = string_scalar(""), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr strip(strings_column_view const& input, + side_type side, + string_scalar const& to_strip, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index e0d1bc8cf31..2acc834a1cb 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -105,13 +105,12 @@ struct substring_fn { } // namespace // -std::unique_ptr slice_strings( - strings_column_view const& strings, - numeric_scalar const& start = numeric_scalar(0, false), - numeric_scalar const& stop = 
numeric_scalar(0, false), - numeric_scalar const& step = numeric_scalar(1), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr slice_strings(strings_column_view const& strings, + numeric_scalar const& start, + numeric_scalar const& stop, + numeric_scalar const& step, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (strings.is_empty()) return make_empty_column(type_id::STRING); @@ -291,12 +290,11 @@ void compute_substring_indices(column_device_view const& d_column, } // namespace // -std::unique_ptr slice_strings( - strings_column_view const& strings, - column_view const& starts_column, - column_view const& stops_column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr slice_strings(strings_column_view const& strings, + column_view const& starts_column, + column_view const& stops_column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 01ecc49f10a..5b23b092cce 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -86,11 +86,10 @@ struct translate_fn { } // namespace // -std::unique_ptr translate( - strings_column_view const& strings, - std::vector> const& chars_table, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr translate(strings_column_view const& strings, + std::vector> const& chars_table, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (strings.is_empty()) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index cd0aafc3545..335908d65d1 100644 --- 
a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -91,11 +91,10 @@ struct execute_wrap { } // namespace template -std::unique_ptr wrap( - strings_column_view const& strings, - size_type width, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr wrap(strings_column_view const& strings, + size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(width > 0, "Positive wrap width required"); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8675dc891c1..5ff2e9bf6d6 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -146,7 +146,7 @@ ConfigureTest(HASH_MAP_TEST hash_map/map_test.cu) # ################################################################################################## # * quantiles tests ------------------------------------------------------------------------------- ConfigureTest( - QUANTILES_TEST quantiles/percentile_approx_test.cu quantiles/quantile_test.cpp + QUANTILES_TEST quantiles/percentile_approx_test.cpp quantiles/quantile_test.cpp quantiles/quantiles_test.cpp ) diff --git a/cpp/tests/copying/detail_gather_tests.cu b/cpp/tests/copying/detail_gather_tests.cu index a8abaa33ac3..bf2937ae8ab 100644 --- a/cpp/tests/copying/detail_gather_tests.cu +++ b/cpp/tests/copying/detail_gather_tests.cu @@ -48,7 +48,7 @@ TYPED_TEST(GatherTest, GatherDetailDeviceVectorTest) constexpr cudf::size_type source_size{1000}; rmm::device_uvector gather_map(source_size, cudf::get_default_stream()); thrust::sequence( - rmm::exec_policy(cudf::get_default_stream()), gather_map.begin(), gather_map.end()); + rmm::exec_policy_nosync(cudf::get_default_stream()), gather_map.begin(), gather_map.end()); auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); cudf::test::fixed_width_column_wrapper source_column(data, data + source_size); diff --git 
a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index 6d903cca020..2538cd9d851 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -81,8 +81,9 @@ TYPED_TEST(FixedWidthGetValueTest, IndexOutOfBounds) { fixed_width_column_wrapper col({9, 8, 7, 6}, {0, 1, 0, 1}); - CUDF_EXPECT_THROW_MESSAGE(get_element(col, -1);, "Index out of bounds"); - CUDF_EXPECT_THROW_MESSAGE(get_element(col, 4);, "Index out of bounds"); + // Test for out of bounds indexes in both directions. + EXPECT_THROW(get_element(col, -1), cudf::logic_error); + EXPECT_THROW(get_element(col, 4), cudf::logic_error); } struct StringGetValueTest : public BaseFixture { diff --git a/cpp/tests/copying/segmented_gather_list_tests.cpp b/cpp/tests/copying/segmented_gather_list_tests.cpp index c05db05d57c..3ba7f668595 100644 --- a/cpp/tests/copying/segmented_gather_list_tests.cpp +++ b/cpp/tests/copying/segmented_gather_list_tests.cpp @@ -576,26 +576,31 @@ TEST_F(SegmentedGatherTestFloat, Fails) cudf::test::strings_column_wrapper nonlist_map1{"1", "2", "0", "1"}; LCW nonlist_map2{{"1", "2", "0", "1"}}; - CUDF_EXPECT_THROW_MESSAGE( + // Input must be a list of integer indices. It should fail for integers, + // strings, or lists containing anything other than integers. 
+ EXPECT_THROW( cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{nonlist_map0}), - "lists_column_view only supports lists"); + cudf::logic_error); - CUDF_EXPECT_THROW_MESSAGE( + EXPECT_THROW( cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{nonlist_map1}), - "lists_column_view only supports lists"); + cudf::logic_error); - CUDF_EXPECT_THROW_MESSAGE( + EXPECT_THROW( cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{nonlist_map2}), - "Gather map should be list column of index type"); + cudf::logic_error); auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); LCW nulls_map{{{3, 2, 1, 0}, {0}, {0}, {0, 1}}, valids}; - CUDF_EXPECT_THROW_MESSAGE( + + // Nulls are not supported in the gather map. + EXPECT_THROW( cudf::lists::detail::segmented_gather(lists_column_view{list}, lists_column_view{nulls_map}), - "Gather map contains nulls"); + cudf::logic_error); - CUDF_EXPECT_THROW_MESSAGE(cudf::lists::detail::segmented_gather( - lists_column_view{list}, lists_column_view{size_mismatch_map}), - "Gather map and list column should be same size"); + // Gather map and list column sizes must be the same. 
+ EXPECT_THROW(cudf::lists::detail::segmented_gather(lists_column_view{list}, + lists_column_view{size_mismatch_map}), + cudf::logic_error); } diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index e83d961cd9b..eb4a3e895f9 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -29,25 +29,10 @@ TEST(ExpectsTest, FalseCondition) TEST(ExpectsTest, TrueCondition) { EXPECT_NO_THROW(CUDF_EXPECTS(true, "condition is true")); } -TEST(ExpectsTest, TryCatch) -{ - CUDF_EXPECT_THROW_MESSAGE(CUDF_EXPECTS(false, "test reason"), "test reason"); -} - -TEST(CudaTryTest, Error) -{ - CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorLaunchFailure), - "cudaErrorLaunchFailure unspecified launch failure"); -} +TEST(CudaTryTest, Error) { EXPECT_THROW(CUDF_CUDA_TRY(cudaErrorLaunchFailure), cudf::cuda_error); } TEST(CudaTryTest, Success) { EXPECT_NO_THROW(CUDF_CUDA_TRY(cudaSuccess)); } -TEST(CudaTryTest, TryCatch) -{ - CUDA_EXPECT_THROW_MESSAGE(CUDF_CUDA_TRY(cudaErrorMemoryAllocation), - "cudaErrorMemoryAllocation out of memory"); -} - TEST(StreamCheck, success) { EXPECT_NO_THROW(CUDF_CHECK_CUDA(0)); } namespace { @@ -79,9 +64,7 @@ TEST(StreamCheck, CatchFailedKernel) #ifndef NDEBUG stream.synchronize(); #endif - CUDA_EXPECT_THROW_MESSAGE(CUDF_CHECK_CUDA(stream.value()), - "cudaErrorInvalidConfiguration " - "invalid configuration argument"); + EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cuda_error); } __global__ void kernel() { asm("trap;"); } diff --git a/cpp/tests/groupby/count_scan_tests.cpp b/cpp/tests/groupby/count_scan_tests.cpp index 164e967e28e..54df690d307 100644 --- a/cpp/tests/groupby/count_scan_tests.cpp +++ b/cpp/tests/groupby/count_scan_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,9 +53,10 @@ TYPED_TEST(groupby_count_scan_test, basic) result_wrapper expect_vals{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; // clang-format on + // Count groupby aggregation is only supported with null_policy::EXCLUDE auto agg1 = cudf::make_count_aggregation(); - CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)), - "Unsupported groupby scan aggregation"); + EXPECT_THROW(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)), + cudf::logic_error); auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); @@ -181,13 +182,13 @@ TYPED_TEST(FixedPointTestAllReps, GroupByCountScan) auto const expect_vals = result_wrapper{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; // clang-format on - CUDF_EXPECT_THROW_MESSAGE( - test_single_scan(keys, - vals, - expect_keys, - expect_vals, - cudf::make_count_aggregation()), - "Unsupported groupby scan aggregation"); + // Count groupby aggregation is only supported with null_policy::EXCLUDE + EXPECT_THROW(test_single_scan(keys, + vals, + expect_keys, + expect_vals, + cudf::make_count_aggregation()), + cudf::logic_error); auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); @@ -209,9 +210,10 @@ TEST_F(groupby_dictionary_count_scan_test, basic) result_wrapper expect_vals{0, 0, 0, 1, 0, 1}; // clang-format on + // Count groupby aggregation is only supported with null_policy::EXCLUDE auto agg1 = cudf::make_count_aggregation(); - CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)), - "Unsupported groupby scan aggregation"); + EXPECT_THROW(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)), + cudf::logic_error); test_single_scan(keys, 
vals, expect_keys, diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp index b333d9dacba..83f522ed913 100644 --- a/cpp/tests/groupby/groupby_test_util.hpp +++ b/cpp/tests/groupby/groupby_test_util.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -131,57 +131,5 @@ inline void test_single_scan(column_view const& keys, expect_vals, *result.second[0].results[0], debug_output_level::ALL_ERRORS); } -template -inline T frand() -{ - return static_cast(rand()) / static_cast(RAND_MAX); -} - -template -inline T rand_range(T min, T max) -{ - return min + static_cast(frand() * (max - min)); -} - -inline std::unique_ptr generate_typed_percentile_distribution( - std::vector const& buckets, - std::vector const& sizes, - data_type t, - bool sorted = false) -{ - srand(0); - - std::vector values; - size_t total_size = std::reduce(sizes.begin(), sizes.end(), 0); - values.reserve(total_size); - for (size_t idx = 0; idx < sizes.size(); idx++) { - double min = idx == 0 ? 0.0f : buckets[idx - 1]; - double max = buckets[idx]; - - for (int v_idx = 0; v_idx < sizes[idx]; v_idx++) { - values.push_back(rand_range(min, max)); - } - } - - if (sorted) { std::sort(values.begin(), values.end()); } - - cudf::test::fixed_width_column_wrapper src(values.begin(), values.end()); - return cudf::cast(src, t); -} - -// "standardized" means the parameters sent into generate_typed_percentile_distribution. the intent -// is to provide a standardized set of inputs for use with tdigest generation tests and -// percentile_approx tests. 
std::vector -// buckets{10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0}; std::vector -// sizes{50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000}; -inline std::unique_ptr generate_standardized_percentile_distribution( - data_type t = data_type{type_id::FLOAT64}, bool sorted = false) -{ - std::vector buckets{10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0, 90.0f, 100.0f}; - std::vector b_sizes{ - 50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000}; - return generate_typed_percentile_distribution(buckets, b_sizes, t, sorted); -} - } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp index 19e82c4ffd1..d2f2f233953 100644 --- a/cpp/tests/groupby/keys_tests.cpp +++ b/cpp/tests/groupby/keys_tests.cpp @@ -234,12 +234,11 @@ TYPED_TEST(groupby_keys_test, mismatch_num_rows) fixed_width_column_wrapper keys{1, 2, 3}; fixed_width_column_wrapper vals{0, 1, 2, 3, 4}; + // Verify that scan throws an error when given data of mismatched sizes. 
auto agg = cudf::make_count_aggregation(); - CUDF_EXPECT_THROW_MESSAGE(test_single_agg(keys, vals, keys, vals, std::move(agg)), - "Size mismatch between request values and groupby keys."); + EXPECT_THROW(test_single_agg(keys, vals, keys, vals, std::move(agg)), cudf::logic_error); auto agg2 = cudf::make_count_aggregation(); - CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, keys, vals, std::move(agg2)), - "Size mismatch between request values and groupby keys."); + EXPECT_THROW(test_single_scan(keys, vals, keys, vals, std::move(agg2)), cudf::logic_error); } template diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index d4e8b4cbf0f..c9f31576aff 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -508,65 +508,60 @@ TEST_F(groupby_rank_scan_test_failures, DISABLED_test_exception_triggers) auto const keys = input{{1, 2, 3}, null_at(2)}; auto const col = input{3, 3, 1}; - CUDF_EXPECT_THROW_MESSAGE( - test_single_scan(keys, - col, - keys, - col, - make_rank_aggregation(rank_method::DENSE), - null_policy::INCLUDE, - sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); - - CUDF_EXPECT_THROW_MESSAGE( - test_single_scan(keys, - col, - keys, - col, - make_rank_aggregation(rank_method::MIN), - null_policy::INCLUDE, - sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); - - CUDF_EXPECT_THROW_MESSAGE( - test_single_scan(keys, - col, - keys, - col, - make_rank_aggregation(rank_method::DENSE), - null_policy::EXCLUDE, - sorted::YES), - "Rank aggregate in groupby scan requires the keys to be presorted"); - - CUDF_EXPECT_THROW_MESSAGE( - test_single_scan(keys, - col, - keys, - col, - make_rank_aggregation(rank_method::MIN), - null_policy::EXCLUDE, - sorted::YES), - "Rank aggregate in groupby scan requires the keys to be presorted"); - - CUDF_EXPECT_THROW_MESSAGE( - test_single_scan(keys, - col, - keys, - col, - 
make_rank_aggregation(rank_method::DENSE), - null_policy::EXCLUDE, - sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); - - CUDF_EXPECT_THROW_MESSAGE( - test_single_scan(keys, - col, - keys, - col, - make_rank_aggregation(rank_method::MIN), - null_policy::EXCLUDE, - sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); + // All of these aggregations raise exceptions unless provided presorted keys + EXPECT_THROW(test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::DENSE), + null_policy::INCLUDE, + sorted::NO), + cudf::logic_error); + + EXPECT_THROW(test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::MIN), + null_policy::INCLUDE, + sorted::NO), + cudf::logic_error); + + EXPECT_THROW(test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::DENSE), + null_policy::EXCLUDE, + sorted::YES), + cudf::logic_error); + + EXPECT_THROW(test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::MIN), + null_policy::EXCLUDE, + sorted::YES), + cudf::logic_error); + + EXPECT_THROW(test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::DENSE), + null_policy::EXCLUDE, + sorted::NO), + cudf::logic_error); + + EXPECT_THROW(test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::MIN), + null_policy::EXCLUDE, + sorted::NO), + cudf::logic_error); } } // namespace test diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 2e4a41a70f8..70b0851c814 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include @@ -466,13 +466,13 @@ TEST_F(TDigestMergeTest, EmptyGroups) cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0}; int const delta = 1000; - auto a = 
cudf::detail::tdigest::make_empty_tdigest_column(cudf::get_default_stream()); + auto a = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); auto b = cudf::type_dispatcher( static_cast(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta); - auto c = cudf::detail::tdigest::make_empty_tdigest_column(cudf::get_default_stream()); + auto c = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); auto d = cudf::type_dispatcher( static_cast(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta); - auto e = cudf::detail::tdigest::make_empty_tdigest_column(cudf::get_default_stream()); + auto e = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); std::vector cols; cols.push_back(*a); diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index 51dfc467e00..a97f44bce43 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ b/cpp/tests/io/comp/decomp_test.cpp @@ -16,6 +16,7 @@ #include #include +#include #include @@ -118,6 +119,9 @@ struct BrotliDecompressTest : public DecompressTest { } }; +struct NvcompConfigTest : public cudf::test::BaseFixture { +}; + TEST_F(GzipDecompressTest, HelloWorld) { constexpr char uncompressed[] = "hello world"; @@ -166,4 +170,58 @@ TEST_F(BrotliDecompressTest, HelloWorld) EXPECT_EQ(output, input); } +TEST_F(NvcompConfigTest, Compression) +{ + using cudf::io::nvcomp::compression_type; + auto const& comp_disabled = cudf::io::nvcomp::is_compression_disabled; + + EXPECT_FALSE(comp_disabled(compression_type::DEFLATE, {2, 5, 0, true, true, 0})); + // version 2.5 required + EXPECT_TRUE(comp_disabled(compression_type::DEFLATE, {2, 4, 0, true, true, 0})); + // all integrations enabled required + EXPECT_TRUE(comp_disabled(compression_type::DEFLATE, {2, 5, 0, false, true, 0})); + + EXPECT_FALSE(comp_disabled(compression_type::ZSTD, {2, 4, 0, true, true, 0})); + EXPECT_FALSE(comp_disabled(compression_type::ZSTD, {2, 4, 0, false, true, 0})); 
+ // 2.4 version required + EXPECT_TRUE(comp_disabled(compression_type::ZSTD, {2, 3, 1, false, true, 0})); + // stable integrations enabled required + EXPECT_TRUE(comp_disabled(compression_type::ZSTD, {2, 4, 0, false, false, 0})); + + EXPECT_FALSE(comp_disabled(compression_type::SNAPPY, {2, 5, 0, true, true, 0})); + EXPECT_FALSE(comp_disabled(compression_type::SNAPPY, {2, 4, 0, false, true, 0})); + // stable integrations enabled required + EXPECT_TRUE(comp_disabled(compression_type::SNAPPY, {2, 3, 0, false, false, 0})); +} + +TEST_F(NvcompConfigTest, Decompression) +{ + using cudf::io::nvcomp::compression_type; + auto const& decomp_disabled = cudf::io::nvcomp::is_decompression_disabled; + + EXPECT_FALSE(decomp_disabled(compression_type::DEFLATE, {2, 5, 0, true, true, 7})); + // version 2.5 required + EXPECT_TRUE(decomp_disabled(compression_type::DEFLATE, {2, 4, 0, true, true, 7})); + // all integrations enabled required + EXPECT_TRUE(decomp_disabled(compression_type::DEFLATE, {2, 5, 0, false, true, 7})); + + EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {2, 4, 0, true, true, 7})); + EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {2, 3, 2, false, true, 6})); + EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {2, 3, 0, true, true, 6})); + // 2.3.1 and earlier requires all integrations to be enabled + EXPECT_TRUE(decomp_disabled(compression_type::ZSTD, {2, 3, 1, false, true, 7})); + // 2.3 version required + EXPECT_TRUE(decomp_disabled(compression_type::ZSTD, {2, 2, 0, true, true, 7})); + // stable integrations enabled required + EXPECT_TRUE(decomp_disabled(compression_type::ZSTD, {2, 4, 0, false, false, 7})); + // 2.4.0 disabled on Pascal + EXPECT_TRUE(decomp_disabled(compression_type::ZSTD, {2, 4, 0, true, true, 6})); + + EXPECT_FALSE(decomp_disabled(compression_type::SNAPPY, {2, 4, 0, true, true, 7})); + EXPECT_FALSE(decomp_disabled(compression_type::SNAPPY, {2, 3, 0, false, true, 7})); + EXPECT_FALSE(decomp_disabled(compression_type::SNAPPY, {2, 
2, 0, false, true, 7})); + // stable integrations enabled required + EXPECT_TRUE(decomp_disabled(compression_type::SNAPPY, {2, 2, 0, false, false, 7})); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 8100c8e3d7f..eeca87446ec 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -2244,4 +2244,50 @@ TEST_F(CsvReaderTest, CsvDefaultOptionsWriteReadMatch) EXPECT_EQ(new_table_and_metadata.metadata.column_names[1], "1"); } +TEST_F(CsvReaderTest, EmptyColumns) +{ + // First column only has empty fields. second column contains only "null" literals + std::string csv_in{",null\n,null"}; + + cudf::io::csv_reader_options in_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()}) + .names({"a", "b", "c", "d"}) + .header(-1); + // More elements in `names` than in the file; additional columns are filled with nulls + auto result = cudf::io::read_csv(in_opts); + + const auto result_table = result.tbl->view(); + EXPECT_EQ(result_table.num_columns(), 4); + // All columns should contain only nulls; expect INT8 type to use as little memory as possible + for (auto& column : result_table) { + EXPECT_EQ(column.type(), data_type{type_id::INT8}); + EXPECT_EQ(column.null_count(), 2); + } +} + +TEST_F(CsvReaderTest, BlankLineAfterFirstRow) +{ + std::string csv_in{"12,9., 10\n\n"}; + + { + cudf::io::csv_reader_options no_header_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()}) + .header(-1); + // No header, getting column names/count from first row + auto result = cudf::io::read_csv(no_header_opts); + + const auto result_table = result.tbl->view(); + ASSERT_EQ(result_table.num_columns(), 3); + } + { + cudf::io::csv_reader_options header_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_in.c_str(), csv_in.size()}); + // Getting column names/count from header + auto result = 
cudf::io::read_csv(header_opts); + + const auto result_table = result.tbl->view(); + ASSERT_EQ(result_table.num_columns(), 3); + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index 59942eecc1d..d3cd1dd9490 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -750,9 +750,9 @@ TEST_F(JsonTest, TreeRepresentationError) cudf::io::json::detail::get_token_stream(d_input, options, stream); // Get the JSON's tree representation - CUDF_EXPECT_THROW_MESSAGE( - cuio_json::detail::get_tree_representation(tokens_gpu, token_indices_gpu, stream), - "JSON Parser encountered an invalid format at location 6"); + // This JSON is invalid and will raise an exception. + EXPECT_THROW(cuio_json::detail::get_tree_representation(tokens_gpu, token_indices_gpu, stream), + cudf::logic_error); } /** diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 01a1f0647cc..5a556f35501 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -588,11 +588,11 @@ TEST_P(JsonParserTest, ExpectFailMixStructAndList) R"( [{"a":[123, {"0": 123}], "b":1.0}, {"b":1.1}, {"b":2.1}] )", R"( [{"a":[123, "123"], "b":1.0}, {"b":1.1}, {"b":2.1}] )"}; + // libcudf does not currently support a mix of lists and structs. 
for (auto const& input : inputs_fail) { - CUDF_EXPECT_THROW_MESSAGE( - auto const cudf_table = json_parser( - cudf::host_span{input.data(), input.size()}, options, stream, mr), - "A mix of lists and structs within the same column is not supported"); + EXPECT_THROW(auto const cudf_table = json_parser( + cudf::host_span{input.data(), input.size()}, options, stream, mr), + cudf::logic_error); } for (auto const& input : inputs_succeed) { diff --git a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp index a5b036210ba..17265326fde 100644 --- a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp @@ -217,17 +217,15 @@ TEST_F(ApplyBooleanMaskTest, Failure) // Invalid mask type. auto const input = lists{{1, 2, 3}, {4, 5, 6}}; auto const filter = lists{{0, 0, 0}}; - CUDF_EXPECT_THROW_MESSAGE( - apply_boolean_mask(lists_column_view{input}, lists_column_view{filter}), - "Mask must be of type BOOL8."); + EXPECT_THROW(apply_boolean_mask(lists_column_view{input}, lists_column_view{filter}), + cudf::logic_error); } { // Mismatched number of rows. 
auto const input = lists{{1, 2, 3}, {4, 5, 6}}; auto const filter = filter_t{{0, 0, 0}}; - CUDF_EXPECT_THROW_MESSAGE( - apply_boolean_mask(lists_column_view{input}, lists_column_view{filter}), - "Boolean masks column must have same number of rows as input."); + EXPECT_THROW(apply_boolean_mask(lists_column_view{input}, lists_column_view{filter}), + cudf::logic_error); } } } // namespace cudf::test diff --git a/cpp/tests/quantiles/percentile_approx_test.cu b/cpp/tests/quantiles/percentile_approx_test.cpp similarity index 69% rename from cpp/tests/quantiles/percentile_approx_test.cu rename to cpp/tests/quantiles/percentile_approx_test.cpp index b02b7d6c336..2840d275d4d 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cu +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -13,56 +13,47 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include + +#include +#include +#include +#include +#include #include -#include #include #include #include -#include +#include +#include #include -#include #include +#include -#include -#include -#include -#include - -#include - -#include - -#include -#include -#include - -using namespace cudf; -using namespace cudf::tdigest; +#include -std::unique_ptr arrow_percentile_approx(column_view const& _values, - int delta, - std::vector const& percentages) +std::unique_ptr arrow_percentile_approx(cudf::column_view const& _values, + int delta, + std::vector const& percentages) { // sort the incoming values using the same settings that groupby does. // this is a little weak because null_order::AFTER is hardcoded internally to groupby. 
- table_view t({_values}); - auto sorted_t = cudf::sort(t, {}, {null_order::AFTER}); + cudf::table_view t({_values}); + auto sorted_t = cudf::sort(t, {}, {cudf::null_order::AFTER}); auto sorted_values = sorted_t->get_column(0).view(); std::vector h_values(sorted_values.size()); - cudaMemcpy(h_values.data(), - sorted_values.data(), - sizeof(double) * sorted_values.size(), - cudaMemcpyDeviceToHost); + CUDF_CUDA_TRY(cudaMemcpy(h_values.data(), + sorted_values.data(), + sizeof(double) * sorted_values.size(), + cudaMemcpyDeviceToHost)); std::vector h_validity(sorted_values.size()); if (sorted_values.null_mask() != nullptr) { auto validity = cudf::mask_to_bools(sorted_values.null_mask(), 0, sorted_values.size()); - cudaMemcpy(h_validity.data(), - (validity->view().data()), - sizeof(char) * sorted_values.size(), - cudaMemcpyDeviceToHost); + CUDF_CUDA_TRY(cudaMemcpy(h_validity.data(), + (validity->view().data()), + sizeof(char) * sorted_values.size(), + cudaMemcpyDeviceToHost)); } // generate the tdigest @@ -79,8 +70,8 @@ std::unique_ptr arrow_percentile_approx(column_view const& _values, return atd.Quantile(p); }); cudf::test::fixed_width_column_wrapper result(h_result.begin(), h_result.end()); - cudf::test::fixed_width_column_wrapper offsets{ - 0, static_cast(percentages.size())}; + cudf::test::fixed_width_column_wrapper offsets{ + 0, static_cast(percentages.size())}; return cudf::make_lists_column(1, offsets.release(), result.release(), 0, {}); } @@ -89,18 +80,18 @@ struct percentile_approx_dispatch { typename T, typename Func, typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> - std::unique_ptr operator()(Func op, - column_view const& values, - int delta, - std::vector const& percentages, - size_type ulps) + std::unique_ptr operator()(Func op, + cudf::column_view const& values, + int delta, + std::vector const& percentages, + cudf::size_type ulps) { // arrow implementation. 
auto expected = [&]() { // we're explicitly casting back to doubles here but this is ok because that is // exactly what happens inside of the cudf implementation as values are processed as well. so // this should not affect results. - auto as_doubles = cudf::cast(values, data_type{type_id::FLOAT64}); + auto as_doubles = cudf::cast(values, cudf::data_type{cudf::type_id::FLOAT64}); return arrow_percentile_approx(*as_doubles, delta, percentages); }(); @@ -109,7 +100,7 @@ struct percentile_approx_dispatch { cudf::test::fixed_width_column_wrapper g_percentages(percentages.begin(), percentages.end()); - tdigest_column_view tdv(*agg_result); + cudf::tdigest::tdigest_column_view tdv(*agg_result); auto result = cudf::percentile_approx(tdv, g_percentages); cudf::test::expect_columns_equivalent( @@ -122,21 +113,21 @@ struct percentile_approx_dispatch { typename T, typename Func, typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> - std::unique_ptr operator()(Func op, - column_view const& values, - int delta, - std::vector const& percentages, - size_type ulps) + std::unique_ptr operator()(Func op, + cudf::column_view const& values, + int delta, + std::vector const& percentages, + cudf::size_type ulps) { CUDF_FAIL("Invalid input type for percentile_approx test"); } }; -void percentile_approx_test(column_view const& _keys, - column_view const& _values, +void percentile_approx_test(cudf::column_view const& _keys, + cudf::column_view const& _values, int delta, std::vector const& percentages, - size_type ulps) + cudf::size_type ulps) { // first pass: validate the actual percentages we get per group. @@ -146,8 +137,8 @@ void percentile_approx_test(column_view const& _keys, cudf::table_view v({_values}); auto groups = pass1_gb.get_groups(v); // slice it all up so we have keys/columns for everything. 
- std::vector keys; - std::vector values; + std::vector keys; + std::vector values; for (size_t idx = 0; idx < groups.offsets.size() - 1; idx++) { auto k = cudf::slice(groups.keys->get_column(0), {groups.offsets[idx], groups.offsets[idx + 1]}); @@ -158,11 +149,11 @@ void percentile_approx_test(column_view const& _keys, values.push_back(v[0]); } - std::vector> groupby_parts; - std::vector> reduce_parts; + std::vector> groupby_parts; + std::vector> reduce_parts; for (size_t idx = 0; idx < values.size(); idx++) { // via groupby - auto groupby = [&](column_view const& values, int delta) { + auto groupby = [&](cudf::column_view const& values, int delta) { cudf::table_view t({keys[idx]}); cudf::groupby::groupby gb(t); std::vector requests; @@ -180,12 +171,12 @@ void percentile_approx_test(column_view const& _keys, ulps)); // via reduce - auto reduce = [](column_view const& values, int delta) { + auto reduce = [](cudf::column_view const& values, int delta) { // result is a scalar, but we want to extract out the underlying column auto scalar_result = cudf::reduce(values, *cudf::make_tdigest_aggregation(delta), - data_type{type_id::STRUCT}); + cudf::data_type{cudf::type_id::STRUCT}); auto tbl = static_cast(scalar_result.get())->view(); std::vector> cols; std::transform( @@ -206,11 +197,11 @@ void percentile_approx_test(column_view const& _keys, // second pass. run the percentile_approx with all the keys in one pass and make sure we get the // same results as the concatenated by-key results. 
- std::vector part_views; + std::vector part_views; std::transform(groupby_parts.begin(), groupby_parts.end(), std::back_inserter(part_views), - [](std::unique_ptr const& c) { return c->view(); }); + [](std::unique_ptr const& c) { return c->view(); }); auto expected = cudf::concatenate(part_views); cudf::groupby::groupby gb(k); @@ -222,22 +213,20 @@ void percentile_approx_test(column_view const& _keys, cudf::test::fixed_width_column_wrapper g_percentages(percentages.begin(), percentages.end()); - tdigest_column_view tdv(*(gb_result.second[0].results[0])); + cudf::tdigest::tdigest_column_view tdv(*(gb_result.second[0].results[0])); auto result = cudf::percentile_approx(tdv, g_percentages); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *result); } -void simple_test(data_type input_type, std::vector> params) +void simple_test(cudf::data_type input_type, std::vector> params) { auto values = cudf::test::generate_standardized_percentile_distribution(input_type); // all in the same group auto keys = cudf::make_fixed_width_column( - data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); - thrust::fill(rmm::exec_policy(cudf::get_default_stream()), - keys->mutable_view().template begin(), - keys->mutable_view().template end(), - 0); + cudf::data_type{cudf::type_id::INT32}, values->size(), cudf::mask_state::UNALLOCATED); + CUDF_CUDA_TRY( + cudaMemset(keys->mutable_view().data(), 0, values->size() * sizeof(int32_t))); // runs both groupby and reduce paths std::for_each(params.begin(), params.end(), [&](std::pair const& params) { @@ -247,21 +236,22 @@ void simple_test(data_type input_type, std::vector> params) } struct group_index { - __device__ int operator()(int i) { return i / 150000; } + int32_t operator()(int32_t i) { return i / 150000; } }; -void grouped_test(data_type input_type, std::vector> params) +void grouped_test(cudf::data_type input_type, std::vector> params) { auto values = cudf::test::generate_standardized_percentile_distribution(input_type); // 
all in the same group auto keys = cudf::make_fixed_width_column( - data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); - auto i = thrust::make_counting_iterator(0); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - i, - i + values->size(), - keys->mutable_view().template begin(), - group_index{}); + cudf::data_type{cudf::type_id::INT32}, values->size(), cudf::mask_state::UNALLOCATED); + auto i = thrust::make_counting_iterator(0); + auto h_keys = std::vector(values->size()); + std::transform(i, i + values->size(), h_keys.begin(), group_index{}); + CUDF_CUDA_TRY(cudaMemcpy(keys->mutable_view().data(), + h_keys.data(), + h_keys.size() * sizeof(int32_t), + cudaMemcpyHostToDevice)); std::for_each(params.begin(), params.end(), [&](std::pair const& params) { percentile_approx_test( @@ -269,25 +259,21 @@ void grouped_test(data_type input_type, std::vector> params) }); } -std::pair make_null_mask(column_view const& col) +std::pair make_null_mask(cudf::column_view const& col) { - return cudf::detail::valid_if( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [] __device__(size_type i) { return i % 2 == 0; }, - cudf::get_default_stream()); + auto itr = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + auto mask = cudf::test::detail::make_null_mask(itr, itr + col.size()); + return std::make_pair(std::move(mask), col.size() / 2); } -void simple_with_nulls_test(data_type input_type, std::vector> params) +void simple_with_nulls_test(cudf::data_type input_type, std::vector> params) { auto values = cudf::test::generate_standardized_percentile_distribution(input_type); // all in the same group auto keys = cudf::make_fixed_width_column( - data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); - thrust::fill(rmm::exec_policy(cudf::get_default_stream()), - keys->mutable_view().template begin(), - keys->mutable_view().template end(), - 0); + 
cudf::data_type{cudf::type_id::INT32}, values->size(), cudf::mask_state::UNALLOCATED); + CUDF_CUDA_TRY( + cudaMemset(keys->mutable_view().data(), 0, values->size() * sizeof(int32_t))); // add a null mask auto mask = make_null_mask(*values); @@ -299,18 +285,19 @@ void simple_with_nulls_test(data_type input_type, std::vector> params) +void grouped_with_nulls_test(cudf::data_type input_type, std::vector> params) { auto values = cudf::test::generate_standardized_percentile_distribution(input_type); // all in the same group auto keys = cudf::make_fixed_width_column( - data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); - auto i = thrust::make_counting_iterator(0); - thrust::transform(rmm::exec_policy(cudf::get_default_stream()), - i, - i + values->size(), - keys->mutable_view().template begin(), - group_index{}); + cudf::data_type{cudf::type_id::INT32}, values->size(), cudf::mask_state::UNALLOCATED); + auto i = thrust::make_counting_iterator(0); + auto h_keys = std::vector(values->size()); + std::transform(i, i + values->size(), h_keys.begin(), group_index{}); + CUDF_CUDA_TRY(cudaMemcpy(keys->mutable_view().data(), + h_keys.data(), + h_keys.size() * sizeof(int32_t), + cudaMemcpyHostToDevice)); // add a null mask auto mask = make_null_mask(*values); @@ -323,10 +310,10 @@ void grouped_with_nulls_test(data_type input_type, std::vector -data_type get_appropriate_type() +cudf::data_type get_appropriate_type() { - if constexpr (cudf::is_fixed_point()) { return data_type{cudf::type_to_id(), -7}; } - return data_type{cudf::type_to_id()}; + if constexpr (cudf::is_fixed_point()) { return cudf::data_type{cudf::type_to_id(), -7}; } + return cudf::data_type{cudf::type_to_id()}; } using PercentileApproxTypes = @@ -386,24 +373,24 @@ struct PercentileApproxTest : public cudf::test::BaseFixture { TEST_F(PercentileApproxTest, EmptyInput) { - auto empty_ = cudf::detail::tdigest::make_empty_tdigest_column(cudf::get_default_stream()); + auto empty_ = 
cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; - std::vector input; + std::vector input; input.push_back(*empty_); input.push_back(*empty_); input.push_back(*empty_); auto empty = cudf::concatenate(input); - tdigest_column_view tdv(*empty); + cudf::tdigest::tdigest_column_view tdv(*empty); auto result = cudf::percentile_approx(tdv, percentiles); - cudf::test::fixed_width_column_wrapper offsets{0, 0, 0, 0}; + cudf::test::fixed_width_column_wrapper offsets{0, 0, 0, 0}; std::vector nulls{0, 0, 0}; auto expected = cudf::make_lists_column(3, offsets.release(), - cudf::make_empty_column(type_id::FLOAT64), + cudf::make_empty_column(cudf::type_id::FLOAT64), 3, cudf::test::detail::make_null_mask(nulls.begin(), nulls.end())); @@ -426,16 +413,18 @@ TEST_F(PercentileApproxTest, EmptyPercentiles) cudf::test::fixed_width_column_wrapper percentiles{}; - tdigest_column_view tdv(*tdigest_column.second[0].results[0]); + cudf::tdigest::tdigest_column_view tdv(*tdigest_column.second[0].results[0]); auto result = cudf::percentile_approx(tdv, percentiles); - cudf::test::fixed_width_column_wrapper offsets{0, 0, 0}; - auto expected = cudf::make_lists_column( - 2, - offsets.release(), - cudf::make_empty_column(type_id::FLOAT64), - 2, - cudf::detail::create_null_mask(2, mask_state::ALL_NULL, cudf::get_default_stream())); + cudf::test::fixed_width_column_wrapper offsets{0, 0, 0}; + std::vector nulls{0, 0}; + auto expected = + cudf::make_lists_column(2, + offsets.release(), + cudf::make_empty_column(cudf::type_id::FLOAT64), + 2, + cudf::test::detail::make_null_mask(nulls.begin(), nulls.end())); + // cudf::detail::create_null_mask(2, cudf::mask_state::ALL_NULL, cudf::get_default_stream())); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -454,7 +443,7 @@ TEST_F(PercentileApproxTest, NullPercentiles) requests.push_back({values, std::move(aggregations)}); auto tdigest_column = 
gb.aggregate(requests); - tdigest_column_view tdv(*tdigest_column.second[0].results[0]); + cudf::tdigest::tdigest_column_view tdv(*tdigest_column.second[0].results[0]); cudf::test::fixed_width_column_wrapper npercentiles{{0.5, 0.5, 1.0, 1.0}, {0, 0, 1, 1}}; auto result = cudf::percentile_approx(tdv, npercentiles); diff --git a/cpp/tests/quantiles/quantile_test.cpp b/cpp/tests/quantiles/quantile_test.cpp index 20acdd02a93..6dfe4f5169b 100644 --- a/cpp/tests/quantiles/quantile_test.cpp +++ b/cpp/tests/quantiles/quantile_test.cpp @@ -14,26 +14,21 @@ * limitations under the License. */ -#include -#include -#include -#include #include #include #include #include #include + +#include +#include +#include + #include #include #include #include -using namespace cudf::test; - -using cudf::null_order; -using cudf::order; -using std::vector; - namespace { struct q_res { q_res(double value, bool is_valid = true) : is_valid(is_valid), value(value) {} @@ -77,9 +72,9 @@ struct q_expect { template struct test_case { - fixed_width_column_wrapper column; - vector expectations; - fixed_width_column_wrapper ordered_indices; + cudf::test::fixed_width_column_wrapper column; + std::vector expectations; + cudf::test::fixed_width_column_wrapper ordered_indices; }; // interpolate_center @@ -104,7 +99,7 @@ test_case interpolate_center() }(); auto max_d = static_cast(max); auto low_d = static_cast(low); - return test_case{fixed_width_column_wrapper({low, max}), + return test_case{cudf::test::fixed_width_column_wrapper({low, max}), {q_expect{0.50, max_d, low_d, lin_d, mid_d, low_d}}}; } @@ -116,7 +111,7 @@ test_case interpolate_center() auto mid_d = 0.5; auto low_d = static_cast(low); auto max_d = static_cast(max); - return test_case{fixed_width_column_wrapper({low, max}), + return test_case{cudf::test::fixed_width_column_wrapper({low, max}), {q_expect{0.5, max_d, low_d, mid_d, mid_d, low_d}}}; } @@ -130,7 +125,7 @@ test_case interpolate_extrema_high() auto low_d = static_cast(low); auto max_d = 
static_cast(max); auto exact_d = static_cast(max - 1); - return test_case{fixed_width_column_wrapper({low, max}), + return test_case{cudf::test::fixed_width_column_wrapper({low, max}), {q_expect{0.50, max_d, low_d, exact_d, exact_d, low_d}}}; } @@ -151,7 +146,7 @@ test_case interpolate_extrema_low() auto a_d = static_cast(a); auto b_d = static_cast(b); auto exact_d = static_cast(a + 1); - return test_case{fixed_width_column_wrapper({a, b}), + return test_case{cudf::test::fixed_width_column_wrapper({a, b}), {q_expect{0.50, b_d, a_d, exact_d, exact_d, a_d}}}; } @@ -166,7 +161,7 @@ test_case interpolate_extrema_low() template std::enable_if_t, test_case> single() { - return test_case{fixed_width_column_wrapper({7.309999942779541}), + return test_case{cudf::test::fixed_width_column_wrapper({7.309999942779541}), { q_expect{ -1.0, @@ -198,13 +193,15 @@ std::enable_if_t, test_case> single() template std::enable_if_t and not cudf::is_boolean(), test_case> single() { - return test_case{fixed_width_column_wrapper({1}), {q_expect{0.7, 1, 1, 1, 1, 1}}}; + return test_case{cudf::test::fixed_width_column_wrapper({1}), + {q_expect{0.7, 1, 1, 1, 1, 1}}}; } template std::enable_if_t(), test_case> single() { - return test_case{fixed_width_column_wrapper({1}), {q_expect{0.7, 1.0, 1.0, 1.0, 1.0, 1.0}}}; + return test_case{cudf::test::fixed_width_column_wrapper({1}), + {q_expect{0.7, 1.0, 1.0, 1.0, 1.0, 1.0}}}; } // all_invalid @@ -213,25 +210,25 @@ template std::enable_if_t, test_case> all_invalid() { return test_case{ - fixed_width_column_wrapper({6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7}, - {0, 0, 0, 0, 0, 0, 0, 0, 0}), + cudf::test::fixed_width_column_wrapper({6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7}, + {0, 0, 0, 0, 0, 0, 0, 0, 0}), {q_expect{-1.0}, q_expect{0.0}, q_expect{0.5}, q_expect{1.0}, q_expect{2.0}}}; } template std::enable_if_t and not cudf::is_boolean(), test_case> all_invalid() { - return test_case{ - fixed_width_column_wrapper({6, 0, 3, 4, 2, 1, -1, 
1, 6}, {0, 0, 0, 0, 0, 0, 0, 0, 0}), - {q_expect{0.7}}}; + return test_case{cudf::test::fixed_width_column_wrapper({6, 0, 3, 4, 2, 1, -1, 1, 6}, + {0, 0, 0, 0, 0, 0, 0, 0, 0}), + {q_expect{0.7}}}; } template std::enable_if_t(), test_case> all_invalid() { - return test_case{ - fixed_width_column_wrapper({1, 0, 1, 1, 0, 1, 0, 1, 1}, {0, 0, 0, 0, 0, 0, 0, 0, 0}), - {q_expect{0.7}}}; + return test_case{cudf::test::fixed_width_column_wrapper({1, 0, 1, 1, 0, 1, 0, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0}), + {q_expect{0.7}}}; } // some invalid @@ -244,14 +241,14 @@ std::enable_if_t, test_case> some_invalid() T mid = -0.432; T lin = -0.432; return test_case{ - fixed_width_column_wrapper({6.8, high, 3.4, 4.17, 2.13, 1.11, low, 0.8, 5.7}, - {0, 1, 0, 0, 0, 0, 1, 0, 0}), + cudf::test::fixed_width_column_wrapper({6.8, high, 3.4, 4.17, 2.13, 1.11, low, 0.8, 5.7}, + {0, 1, 0, 0, 0, 0, 1, 0, 0}), {q_expect{-1.0, low, low, low, low, low}, q_expect{0.0, low, low, low, low, low}, q_expect{0.5, high, low, lin, mid, low}, q_expect{1.0, high, high, high, high, high}, q_expect{2.0, high, high, high, high, high}}, - fixed_width_column_wrapper({6, 1})}; + cudf::test::fixed_width_column_wrapper({6, 1})}; } template @@ -261,7 +258,7 @@ std::enable_if_t, test_case> some_invalid() T low = -1.024; double mid = -0.43200002610683441; double lin = -0.43200002610683441; - return test_case{fixed_width_column_wrapper( + return test_case{cudf::test::fixed_width_column_wrapper( {T(6.8), high, T(3.4), T(4.17), T(2.13), T(1.11), low, T(0.8), T(5.7)}, {0, 1, 0, 0, 0, 0, 1, 0, 0}), {q_expect{-1.0, low, low, low, low, low}, @@ -269,29 +266,29 @@ std::enable_if_t, test_case> some_invalid() q_expect{0.5, high, low, lin, mid, low}, q_expect{1.0, high, high, high, high, high}, q_expect{2.0, high, high, high, high, high}}, - fixed_width_column_wrapper({6, 1})}; + cudf::test::fixed_width_column_wrapper({6, 1})}; } template std::enable_if_t and not cudf::is_boolean(), test_case> some_invalid() { - return test_case{ 
- fixed_width_column_wrapper({6, 0, 3, 4, 2, 1, -1, 1, 6}, {0, 0, 1, 0, 0, 0, 0, 0, 1}), - {q_expect{0.0, 3.0, 3.0, 3.0, 3.0, 3.0}, - q_expect{0.5, 6.0, 3.0, 4.5, 4.5, 3.0}, - q_expect{1.0, 6.0, 6.0, 6.0, 6.0, 6.0}}, - fixed_width_column_wrapper({2, 8})}; + return test_case{cudf::test::fixed_width_column_wrapper({6, 0, 3, 4, 2, 1, -1, 1, 6}, + {0, 0, 1, 0, 0, 0, 0, 0, 1}), + {q_expect{0.0, 3.0, 3.0, 3.0, 3.0, 3.0}, + q_expect{0.5, 6.0, 3.0, 4.5, 4.5, 3.0}, + q_expect{1.0, 6.0, 6.0, 6.0, 6.0, 6.0}}, + cudf::test::fixed_width_column_wrapper({2, 8})}; } template std::enable_if_t(), test_case> some_invalid() { - return test_case{ - fixed_width_column_wrapper({1, 0, 1, 1, 0, 1, 0, 1, 1}, {0, 0, 1, 0, 1, 0, 0, 0, 0}), - {q_expect{0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, - q_expect{0.5, 1.0, 0.0, 0.5, 0.5, 0.0}, - q_expect{1.0, 1.0, 1.0, 1.0, 1.0, 1.0}}, - fixed_width_column_wrapper({4, 2})}; + return test_case{cudf::test::fixed_width_column_wrapper({1, 0, 1, 1, 0, 1, 0, 1, 1}, + {0, 0, 1, 0, 1, 0, 0, 0, 0}), + {q_expect{0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + q_expect{0.5, 1.0, 0.0, 0.5, 0.5, 0.0}, + q_expect{1.0, 1.0, 1.0, 1.0, 1.0, 1.0}}, + cudf::test::fixed_width_column_wrapper({4, 2})}; } // unsorted @@ -300,38 +297,41 @@ template std::enable_if_t, test_case> unsorted() { return test_case{ - fixed_width_column_wrapper({6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.00, 0.8, 5.7}), + cudf::test::fixed_width_column_wrapper({6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.00, 0.8, 5.7}), { q_expect{0.0, -1.00, -1.00, -1.00, -1.00, -1.00}, }, - fixed_width_column_wrapper({6, 1, 7, 5, 4, 2, 3, 8, 0})}; + cudf::test::fixed_width_column_wrapper({6, 1, 7, 5, 4, 2, 3, 8, 0})}; } template std::enable_if_t and not cudf::is_boolean(), test_case> unsorted() { return std::is_signed() - ? test_case{fixed_width_column_wrapper({6, 0, 3, 4, 2, 1, -1, 1, 6}), + ? 
test_case{cudf::test::fixed_width_column_wrapper({6, 0, 3, 4, 2, 1, -1, 1, 6}), {q_expect{0.0, -1, -1, -1, -1, -1}}, - fixed_width_column_wrapper({6, 1, 7, 5, 4, 2, 3, 8, 0})} - : test_case{fixed_width_column_wrapper({6, 0, 3, 4, 2, 1, 1, 1, 6}), + cudf::test::fixed_width_column_wrapper( + {6, 1, 7, 5, 4, 2, 3, 8, 0})} + : test_case{cudf::test::fixed_width_column_wrapper({6, 0, 3, 4, 2, 1, 1, 1, 6}), {q_expect{0.0, 1, 1, 1, 1, 1}}, - fixed_width_column_wrapper({6, 1, 7, 5, 4, 2, 3, 8, 0})}; + cudf::test::fixed_width_column_wrapper( + {6, 1, 7, 5, 4, 2, 3, 8, 0})}; } template std::enable_if_t(), test_case> unsorted() { - return test_case{fixed_width_column_wrapper({0, 0, 1, 1, 0, 1, 1, 0, 1}), - {q_expect{ - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - }}, - fixed_width_column_wrapper({0, 1, 4, 7, 2, 3, 5, 6, 9})}; + return test_case{ + cudf::test::fixed_width_column_wrapper({0, 0, 1, 1, 0, 1, 1, 0, 1}), + {q_expect{ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + }}, + cudf::test::fixed_width_column_wrapper({0, 1, 4, 7, 2, 3, 5, 6, 9})}; } } // namespace testdata @@ -342,40 +342,39 @@ std::enable_if_t(), test_case> unsorted() template void test(testdata::test_case test_case) { - using namespace cudf; - for (auto& expected : test_case.expectations) { auto q = std::vector{expected.quantile}; auto nullable = static_cast(test_case.column).nullable(); auto make_expected_column = [nullable](q_res expected) { - return nullable ? fixed_width_column_wrapper({expected.value}, {expected.is_valid}) - : fixed_width_column_wrapper({expected.value}); + return nullable ? 
cudf::test::fixed_width_column_wrapper({expected.value}, + {expected.is_valid}) + : cudf::test::fixed_width_column_wrapper({expected.value}); }; auto actual_higher = - quantile(test_case.column, q, interpolation::HIGHER, test_case.ordered_indices); + cudf::quantile(test_case.column, q, cudf::interpolation::HIGHER, test_case.ordered_indices); auto expected_higher_col = make_expected_column(expected.higher); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_higher_col, actual_higher->view()); auto actual_lower = - quantile(test_case.column, q, interpolation::LOWER, test_case.ordered_indices); + cudf::quantile(test_case.column, q, cudf::interpolation::LOWER, test_case.ordered_indices); auto expected_lower_col = make_expected_column(expected.lower); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_col, actual_lower->view()); auto actual_linear = - quantile(test_case.column, q, interpolation::LINEAR, test_case.ordered_indices); + cudf::quantile(test_case.column, q, cudf::interpolation::LINEAR, test_case.ordered_indices); auto expected_linear_col = make_expected_column(expected.linear); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_linear_col, actual_linear->view()); auto actual_midpoint = - quantile(test_case.column, q, interpolation::MIDPOINT, test_case.ordered_indices); + cudf::quantile(test_case.column, q, cudf::interpolation::MIDPOINT, test_case.ordered_indices); auto expected_midpoint_col = make_expected_column(expected.midpoint); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_midpoint_col, actual_midpoint->view()); auto actual_nearest = - quantile(test_case.column, q, interpolation::NEAREST, test_case.ordered_indices); + cudf::quantile(test_case.column, q, cudf::interpolation::NEAREST, test_case.ordered_indices); auto expected_nearest_col = make_expected_column(expected.nearest); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_nearest_col, actual_nearest->view()); } @@ -385,10 +384,10 @@ void test(testdata::test_case test_case) // ----- tests 
----------------------------------------------------------------- template -struct QuantileTest : public BaseFixture { +struct QuantileTest : public cudf::test::BaseFixture { }; -using TestTypes = NumericTypes; +using TestTypes = cudf::test::NumericTypes; TYPED_TEST_SUITE(QuantileTest, TestTypes); TYPED_TEST(QuantileTest, TestSingle) { test(testdata::single()); } @@ -413,60 +412,65 @@ TYPED_TEST(QuantileTest, TestInterpolateExtremaLow) TYPED_TEST(QuantileTest, TestEmpty) { - auto input = fixed_width_column_wrapper({}); + auto input = cudf::test::fixed_width_column_wrapper({}); auto expected = cudf::test::fixed_width_column_wrapper({0, 0}, {0, 0}); auto actual = cudf::quantile(input, {0.5, 0.25}); } template -struct QuantileUnsupportedTypesTest : public BaseFixture { +struct QuantileUnsupportedTypesTest : public cudf::test::BaseFixture { }; // TODO add tests for FixedPointTypes -using UnsupportedTestTypes = RemoveIf>, AllTypes>; +using UnsupportedTestTypes = cudf::test::RemoveIf< + cudf::test::ContainedIn>, + cudf::test::AllTypes>; TYPED_TEST_SUITE(QuantileUnsupportedTypesTest, UnsupportedTestTypes); TYPED_TEST(QuantileUnsupportedTypesTest, TestZeroElements) { - fixed_width_column_wrapper input({}); + cudf::test::fixed_width_column_wrapper input({}); EXPECT_THROW(cudf::quantile(input, {0}), cudf::logic_error); } TYPED_TEST(QuantileUnsupportedTypesTest, TestOneElements) { - fixed_width_column_wrapper input({0}); + cudf::test::fixed_width_column_wrapper input({0}); EXPECT_THROW(cudf::quantile(input, {0}), cudf::logic_error); } TYPED_TEST(QuantileUnsupportedTypesTest, TestMultipleElements) { - fixed_width_column_wrapper input({0, 1, 2}); + cudf::test::fixed_width_column_wrapper input({0, 1, 2}); EXPECT_THROW(cudf::quantile(input, {0}), cudf::logic_error); } -struct QuantileDictionaryTest : public BaseFixture { +struct QuantileDictionaryTest : public cudf::test::BaseFixture { }; TEST_F(QuantileDictionaryTest, TestValid) { - dictionary_column_wrapper col{1, 2, 3, 4, 5, 
6, 7, 8, 9, 10}; - fixed_width_column_wrapper indices{0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; + cudf::test::dictionary_column_wrapper col{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + cudf::test::fixed_width_column_wrapper indices{0, 2, 4, 6, 8, 1, 3, 5, 7, 9}; auto result = cudf::quantile(col, {0.5}, cudf::interpolation::LINEAR); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), fixed_width_column_wrapper{5.5}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), + cudf::test::fixed_width_column_wrapper{5.5}); result = cudf::quantile(col, {0.5}, cudf::interpolation::LINEAR, indices); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), fixed_width_column_wrapper{5.5}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), + cudf::test::fixed_width_column_wrapper{5.5}); result = cudf::quantile(col, {0.1, 0.2}, cudf::interpolation::HIGHER); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), fixed_width_column_wrapper{2.0, 3.0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), + cudf::test::fixed_width_column_wrapper{2.0, 3.0}); result = cudf::quantile(col, {0.25, 0.5, 0.75}, cudf::interpolation::MIDPOINT); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), - fixed_width_column_wrapper{3.5, 5.5, 7.5}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT( + result->view(), cudf::test::fixed_width_column_wrapper{3.5, 5.5, 7.5}); }; } // anonymous namespace diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp index b4d1b9984ab..f532e93c6c2 100644 --- a/cpp/tests/quantiles/quantiles_test.cpp +++ b/cpp/tests/quantiles/quantiles_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,52 +20,45 @@ #include #include -#include -#include -#include - #include -#include #include -#include - -using namespace cudf; -using namespace test; +#include +#include template -struct QuantilesTest : public BaseFixture { +struct QuantilesTest : public cudf::test::BaseFixture { }; -using TestTypes = AllTypes; +using TestTypes = cudf::test::AllTypes; TYPED_TEST_SUITE(QuantilesTest, TestTypes); TYPED_TEST(QuantilesTest, TestZeroColumns) { - auto input = table_view(std::vector{}); + auto input = cudf::table_view(std::vector{}); - EXPECT_THROW(quantiles(input, {0.0f}), logic_error); + EXPECT_THROW(cudf::quantiles(input, {0.0f}), cudf::logic_error); } TYPED_TEST(QuantilesTest, TestMultiColumnZeroRows) { using T = TypeParam; - fixed_width_column_wrapper input_a({}); - auto input = table_view({input_a}); + cudf::test::fixed_width_column_wrapper input_a({}); + auto input = cudf::table_view({input_a}); - EXPECT_THROW(quantiles(input, {0.0f}), logic_error); + EXPECT_THROW(cudf::quantiles(input, {0.0f}), cudf::logic_error); } TYPED_TEST(QuantilesTest, TestZeroRequestedQuantiles) { using T = TypeParam; - fixed_width_column_wrapper input_a({1}, {1}); - auto input = table_view(std::vector{input_a}); + cudf::test::fixed_width_column_wrapper input_a({1}, {1}); + auto input = cudf::table_view(std::vector{input_a}); - auto actual = quantiles(input, {}); - auto expected = empty_like(input); + auto actual = cudf::quantiles(input, {}); + auto expected = cudf::empty_like(input); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), actual->view()); } @@ -74,75 +67,75 @@ TYPED_TEST(QuantilesTest, TestMultiColumnOrderCountMismatch) { using T = TypeParam; - fixed_width_column_wrapper input_a({}); - fixed_width_column_wrapper input_b({}); - auto input = table_view({input_a}); - - EXPECT_THROW(quantiles(input, - {0.0f}, - interpolation::NEAREST, - sorted::NO, - {order::ASCENDING}, - {null_order::AFTER, null_order::AFTER}), - logic_error); + cudf::test::fixed_width_column_wrapper 
input_a({}); + cudf::test::fixed_width_column_wrapper input_b({}); + auto input = cudf::table_view({input_a}); + + EXPECT_THROW(cudf::quantiles(input, + {0.0f}, + cudf::interpolation::NEAREST, + cudf::sorted::NO, + {cudf::order::ASCENDING}, + {cudf::null_order::AFTER, cudf::null_order::AFTER}), + cudf::logic_error); } TYPED_TEST(QuantilesTest, TestMultiColumnNullOrderCountMismatch) { using T = TypeParam; - fixed_width_column_wrapper input_a({}); - fixed_width_column_wrapper input_b({}); - auto input = table_view({input_a}); - - EXPECT_THROW(quantiles(input, - {0.0f}, - interpolation::NEAREST, - sorted::NO, - {order::ASCENDING, order::ASCENDING}, - {null_order::AFTER}), - logic_error); + cudf::test::fixed_width_column_wrapper input_a({}); + cudf::test::fixed_width_column_wrapper input_b({}); + auto input = cudf::table_view({input_a}); + + EXPECT_THROW(cudf::quantiles(input, + {0.0f}, + cudf::interpolation::NEAREST, + cudf::sorted::NO, + {cudf::order::ASCENDING, cudf::order::ASCENDING}, + {cudf::null_order::AFTER}), + cudf::logic_error); } TYPED_TEST(QuantilesTest, TestMultiColumnArithmeticInterpolation) { using T = TypeParam; - fixed_width_column_wrapper input_a({}); - fixed_width_column_wrapper input_b({}); - auto input = table_view({input_a}); + cudf::test::fixed_width_column_wrapper input_a({}); + cudf::test::fixed_width_column_wrapper input_b({}); + auto input = cudf::table_view({input_a}); - EXPECT_THROW(quantiles(input, {0.0f}, interpolation::LINEAR), logic_error); + EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::LINEAR), cudf::logic_error); - EXPECT_THROW(quantiles(input, {0.0f}, interpolation::MIDPOINT), logic_error); + EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::MIDPOINT), cudf::logic_error); } TYPED_TEST(QuantilesTest, TestMultiColumnUnsorted) { using T = TypeParam; - auto input_a = strings_column_wrapper( + auto input_a = cudf::test::strings_column_wrapper( {"C", "B", "A", "A", "D", "B", "D", "B", "D", "C", "C", "C", 
"D", "B", "D", "B", "C", "C", "A", "D", "B", "A", "A", "A"}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - fixed_width_column_wrapper input_b( + cudf::test::fixed_width_column_wrapper input_b( {4, 3, 5, 0, 1, 0, 4, 1, 5, 3, 0, 5, 2, 4, 3, 2, 1, 2, 3, 0, 5, 1, 4, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - auto input = table_view({input_a, input_b}); + auto input = cudf::table_view({input_a, input_b}); - auto actual = quantiles(input, - {0.0f, 0.5f, 0.7f, 0.25f, 1.0f}, - interpolation::NEAREST, - sorted::NO, - {order::ASCENDING, order::DESCENDING}); + auto actual = cudf::quantiles(input, + {0.0f, 0.5f, 0.7f, 0.25f, 1.0f}, + cudf::interpolation::NEAREST, + cudf::sorted::NO, + {cudf::order::ASCENDING, cudf::order::DESCENDING}); - auto expected_a = strings_column_wrapper({"A", "C", "C", "B", "D"}, {1, 1, 1, 1, 1}); + auto expected_a = cudf::test::strings_column_wrapper({"A", "C", "C", "B", "D"}, {1, 1, 1, 1, 1}); - fixed_width_column_wrapper expected_b({5, 5, 1, 5, 0}, {1, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected_b({5, 5, 1, 5, 0}, {1, 1, 1, 1, 1}); - auto expected = table_view({expected_a, expected_b}); + auto expected = cudf::table_view({expected_a, expected_b}); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, actual->view()); } @@ -151,25 +144,25 @@ TYPED_TEST(QuantilesTest, TestMultiColumnAssumedSorted) { using T = TypeParam; - auto input_a = strings_column_wrapper( + auto input_a = cudf::test::strings_column_wrapper( {"C", "B", "A", "A", "D", "B", "D", "B", "D", "C", "C", "C", "D", "B", "D", "B", "C", "C", "A", "D", "B", "A", "A", "A"}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - fixed_width_column_wrapper input_b( + cudf::test::fixed_width_column_wrapper input_b( {4, 3, 5, 0, 1, 0, 4, 1, 5, 3, 0, 5, 2, 4, 3, 2, 1, 2, 3, 0, 5, 1, 4, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - auto input = table_view({input_a, 
input_b}); + auto input = cudf::table_view({input_a, input_b}); - auto actual = - quantiles(input, {0.0f, 0.5f, 0.7f, 0.25f, 1.0f}, interpolation::NEAREST, sorted::YES); + auto actual = cudf::quantiles( + input, {0.0f, 0.5f, 0.7f, 0.25f, 1.0f}, cudf::interpolation::NEAREST, cudf::sorted::YES); - auto expected_a = strings_column_wrapper({"C", "D", "C", "D", "A"}, {1, 1, 1, 1, 1}); + auto expected_a = cudf::test::strings_column_wrapper({"C", "D", "C", "D", "A"}, {1, 1, 1, 1, 1}); - fixed_width_column_wrapper expected_b({4, 2, 1, 4, 2}, {1, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected_b({4, 2, 1, 4, 2}, {1, 1, 1, 1, 1}); - auto expected = table_view({expected_a, expected_b}); + auto expected = cudf::table_view({expected_a, expected_b}); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, actual->view()); } diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp index 73b721d5d85..a8e75aeb7e5 100644 --- a/cpp/tests/reductions/rank_tests.cpp +++ b/cpp/tests/reductions/rank_tests.cpp @@ -316,12 +316,12 @@ TEST(RankScanTest, ExclusiveScan) { auto const vals = input{3, 4, 5}; - CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, *dense_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), - "Rank aggregation operator requires an inclusive scan"); - CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, *rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), - "Rank aggregation operator requires an inclusive scan"); - CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, *percent_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), - "Rank aggregation operator requires an inclusive scan"); + // Only inclusive scans are supported, so these should all raise exceptions. 
+ EXPECT_THROW(cudf::scan(vals, *dense_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), + cudf::logic_error); + EXPECT_THROW(cudf::scan(vals, *rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), cudf::logic_error); + EXPECT_THROW(cudf::scan(vals, *percent_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), + cudf::logic_error); } } // namespace cudf::test diff --git a/cpp/tests/sort/segmented_sort_tests.cpp b/cpp/tests/sort/segmented_sort_tests.cpp index 53642a89b3d..09007df38ce 100644 --- a/cpp/tests/sort/segmented_sort_tests.cpp +++ b/cpp/tests/sort/segmented_sort_tests.cpp @@ -80,14 +80,12 @@ TEST_F(SegmentedSortInt, Empty) CUDF_EXPECT_NO_THROW(cudf::segmented_sort_by_key(table_empty, table_empty, segments)); CUDF_EXPECT_NO_THROW(cudf::segmented_sort_by_key(table_empty, table_empty, col_empty)); - CUDF_EXPECT_THROW_MESSAGE(cudf::segmented_sort_by_key(table_empty, table_valid, segments), - "Mismatch in number of rows for values and keys"); - CUDF_EXPECT_THROW_MESSAGE(cudf::segmented_sort_by_key(table_empty, table_valid, col_empty), - "Mismatch in number of rows for values and keys"); - CUDF_EXPECT_THROW_MESSAGE(cudf::segmented_sort_by_key(table_valid, table_empty, segments), - "Mismatch in number of rows for values and keys"); - CUDF_EXPECT_THROW_MESSAGE(cudf::segmented_sort_by_key(table_valid, table_empty, col_empty), - "Mismatch in number of rows for values and keys"); + // Swapping "empty" and "valid" tables is invalid because the keys and values will be of different + // sizes. 
+ EXPECT_THROW(cudf::segmented_sort_by_key(table_empty, table_valid, segments), cudf::logic_error); + EXPECT_THROW(cudf::segmented_sort_by_key(table_empty, table_valid, col_empty), cudf::logic_error); + EXPECT_THROW(cudf::segmented_sort_by_key(table_valid, table_empty, segments), cudf::logic_error); + EXPECT_THROW(cudf::segmented_sort_by_key(table_valid, table_empty, col_empty), cudf::logic_error); } TEST_F(SegmentedSortInt, Single) diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp index 488184f4099..11f5c9f39aa 100644 --- a/cpp/tests/strings/array_tests.cpp +++ b/cpp/tests/strings/array_tests.cpp @@ -17,11 +17,11 @@ #include #include #include +#include #include #include #include -#include #include #include @@ -61,14 +61,14 @@ class SliceParmsTest : public StringsColumnTest, TEST_P(SliceParmsTest, Slice) { std::vector h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + cudf::test::strings_column_wrapper input( + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); cudf::size_type start = 3; cudf::size_type end = GetParam(); - auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end); + + auto scol = cudf::slice(input, {start, end}); + auto results = std::make_unique(scol.front()); cudf::test::strings_column_wrapper expected( h_strings.begin() + start, @@ -81,14 +81,14 @@ TEST_P(SliceParmsTest, Slice) TEST_P(SliceParmsTest, SliceAllNulls) { std::vector h_strings{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + cudf::test::strings_column_wrapper input( + h_strings.begin(), 
h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); cudf::size_type start = 3; cudf::size_type end = GetParam(); - auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end); + + auto scol = cudf::slice(input, {start, end}); + auto results = std::make_unique(scol.front()); cudf::test::strings_column_wrapper expected( h_strings.begin() + start, @@ -101,11 +101,13 @@ TEST_P(SliceParmsTest, SliceAllNulls) TEST_P(SliceParmsTest, SliceAllEmpty) { std::vector h_strings{"", "", "", "", "", "", ""}; - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end()); cudf::size_type start = 3; cudf::size_type end = GetParam(); - auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end); + + auto scol = cudf::slice(input, {start, end}); + auto results = std::make_unique(scol.front()); cudf::test::strings_column_wrapper expected(h_strings.begin() + start, h_strings.begin() + end); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -119,8 +121,8 @@ TEST_F(StringsColumnTest, SliceZeroSizeStringsColumn) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(zero_size_strings_column); - auto results = cudf::strings::detail::copy_slice(strings_view, 1, 2); + auto scol = cudf::slice(zero_size_strings_column, {0, 0}); + auto results = std::make_unique(scol.front()); cudf::test::expect_column_empty(results->view()); } @@ -128,18 +130,14 @@ TEST_F(StringsColumnTest, Gather) { std::vector h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + h_strings.begin(), h_strings.end(), 
cudf::test::iterators::nulls_from_nullptrs(h_strings)); cudf::test::fixed_width_column_wrapper gather_map{{4, 1}}; auto results = cudf::gather(cudf::table_view{{strings}}, gather_map)->release(); std::vector h_expected{"aa", "bb"}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results.front()->view(), expected); } diff --git a/cpp/tests/strings/concatenate_tests.cpp b/cpp/tests/strings/concatenate_tests.cpp index 387f0f5c997..e4f2f7ca62c 100644 --- a/cpp/tests/strings/concatenate_tests.cpp +++ b/cpp/tests/strings/concatenate_tests.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include @@ -60,7 +60,7 @@ TEST_F(StringsConcatenateTest, Concatenate) strings_columns.push_back(strings2); strings_columns.push_back(strings3); - auto results = cudf::strings::detail::concatenate(strings_columns, cudf::get_default_stream()); + auto results = cudf::concatenate(strings_columns); cudf::test::strings_column_wrapper expected(h_strings.begin(), h_strings.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -74,7 +74,7 @@ TEST_F(StringsConcatenateTest, ZeroSizeStringsColumns) strings_columns.push_back(zero_size_strings_column); strings_columns.push_back(zero_size_strings_column); strings_columns.push_back(zero_size_strings_column); - auto results = cudf::strings::detail::concatenate(strings_columns, cudf::get_default_stream()); + auto results = cudf::concatenate(strings_columns); cudf::test::expect_column_empty(results->view()); } @@ -107,6 +107,6 @@ TEST_F(StringsConcatenateTest, ZeroSizeStringsPlusNormal) h_strings.data() + h_strings.size()); strings_columns.push_back(strings1); - auto results = cudf::strings::detail::concatenate(strings_columns, cudf::get_default_stream()); + auto 
results = cudf::concatenate(strings_columns); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings1); } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index aaacc08d5fb..43ef73baf14 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,13 +14,15 @@ * limitations under the License. */ -#include -#include -#include - #include #include #include +#include + +#include +#include +#include +#include #include #include @@ -147,6 +149,9 @@ TEST_F(StringsContainsTests, ContainsTest) h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(ptn); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -161,40 +166,56 @@ TEST_F(StringsContainsTests, MatchesTest) auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::matches_re(strings_view, "lazy"); - bool h_expected[] = {false, false, true, false, false, false, false}; + auto const pattern = std::string("lazy"); + auto results = cudf::strings::matches_re(strings_view, pattern); + bool h_expected[] = {false, false, true, false, false, false, false}; cudf::test::fixed_width_column_wrapper expected( h_expected, h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, "\\d+"); - bool h_expected[] = {false, false, false, true, true, false, false}; + auto const pattern = std::string("\\d+"); + auto results = 
cudf::strings::matches_re(strings_view, pattern); + bool h_expected[] = {false, false, false, true, true, false, false}; cudf::test::fixed_width_column_wrapper expected( h_expected, h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, "@\\w+"); - bool h_expected[] = {false, false, false, false, false, false, false}; + auto const pattern = std::string("@\\w+"); + auto results = cudf::strings::matches_re(strings_view, pattern); + bool h_expected[] = {false, false, false, false, false, false, false}; cudf::test::fixed_width_column_wrapper expected( h_expected, h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::matches_re(strings_view, ".*"); - bool h_expected[] = {true, true, true, true, true, false, true}; + auto const pattern = std::string(".*"); + auto results = cudf::strings::matches_re(strings_view, pattern); + bool h_expected[] = {true, true, true, true, true, false, true}; cudf::test::fixed_width_column_wrapper expected( h_expected, h_expected + h_strings.size(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); 
} } @@ -219,6 +240,9 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) cudf::test::fixed_width_column_wrapper expected( {true, true, false, false, false, false, true, true, true, true}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } { // is_loopback: 72 instructions std::string pattern = @@ -229,6 +253,9 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) cudf::test::fixed_width_column_wrapper expected( {false, false, false, false, false, false, false, false, false, true}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } { // is_multicast: 79 instructions std::string pattern = @@ -239,6 +266,9 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) cudf::test::fixed_width_column_wrapper expected( {false, false, false, false, false, false, true, true, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::matches_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } } @@ -247,18 +277,43 @@ TEST_F(StringsContainsTests, OctalTest) cudf::test::strings_column_wrapper strings({"A3", "B", "CDA3EY", "", "99", "\a\t\r"}); auto strings_view = cudf::strings_column_view(strings); auto expected = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 0, 0}); - auto results = cudf::strings::contains_re(strings_view, "\\101"); + + auto pattern = std::string("\\101"); + auto results = cudf::strings::contains_re(strings_view, pattern); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = 
cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "\\1013"); + + pattern = std::string("\\1013"); + results = cudf::strings::contains_re(strings_view, pattern); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + pattern = std::string("D*\\101\\063"); + results = cudf::strings::contains_re(strings_view, pattern); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "D*\\101\\063"); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "\\719"); + + pattern = std::string("\\719"); + results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "[\\7][\\11][\\15]"); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + pattern = std::string("[\\7][\\11][\\15]"); + results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(StringsContainsTests, HexTest) @@ -285,10 +340,17 @@ TEST_F(StringsContainsTests, HexTest) 0, [ch](auto idx) { return ch == static_cast(idx); }); 
cudf::test::fixed_width_column_wrapper expected(true_dat, true_dat + count); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // also test hex character appearing in character class brackets pattern = "[" + pattern + "]"; results = cudf::strings::contains_re(strings_view, pattern); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -303,36 +365,56 @@ TEST_F(StringsContainsTests, EmbeddedNullCharacter) cudf::test::strings_column_wrapper input(data.begin(), data.end()); auto strings_view = cudf::strings_column_view(input); - auto results = cudf::strings::contains_re(strings_view, "A"); + auto pattern = std::string("A"); + auto results = cudf::strings::contains_re(strings_view, pattern); auto expected = cudf::test::fixed_width_column_wrapper({1, 0, 0, 0, 0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "B"); + pattern = std::string("B"); + results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = 
cudf::strings::contains_re(strings_view, "J\\0B"); + pattern = std::string("J\\0B"); + results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "[G-J][\\0]B"); + pattern = std::string("[G-J][\\0]B"); + results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(strings_view, "[A-D][\\x00]B"); + pattern = std::string("[A-D][\\x00]B"); + results = cudf::strings::contains_re(strings_view, pattern); expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::contains_re(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(StringsContainsTests, Errors) { - cudf::test::strings_column_wrapper input({"3", "33"}); - auto strings_view = cudf::strings_column_view(input); + EXPECT_THROW(cudf::strings::regex_program::create("(3?)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("(?:3?)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("3?+"), 
cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("{3}a"), cudf::logic_error); - EXPECT_THROW(cudf::strings::contains_re(strings_view, "(3?)+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::contains_re(strings_view, "(?:3?)+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::contains_re(strings_view, "3?+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::count_re(strings_view, "{3}a"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("aaaa{1234,5678}"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("aaaa{123,5678}"), cudf::logic_error); } TEST_F(StringsContainsTests, CountTest) @@ -340,36 +422,37 @@ TEST_F(StringsContainsTests, CountTest) std::vector h_strings{ "The quick brown @fox jumps ovér the", "lazy @dog", "1:2:3:4", "00:0:00", nullptr, ""}; cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); auto strings_view = cudf::strings_column_view(strings); { - auto results = cudf::strings::count_re(strings_view, "[tT]he"); - int32_t h_expected[] = {2, 0, 0, 0, 0, 0}; + auto pattern = std::string("[tT]he"); + auto results = cudf::strings::count_re(strings_view, pattern); cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + {2, 0, 0, 0, 0, 0}, cudf::test::iterators::nulls_from_nullptrs(h_strings)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, "@\\w+"); - int32_t h_expected[] = {1, 1, 0, 0, 0, 0}; + 
auto pattern = std::string("@\\w+"); + auto results = cudf::strings::count_re(strings_view, pattern); cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + {1, 1, 0, 0, 0, 0}, cudf::test::iterators::nulls_from_nullptrs(h_strings)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::count_re(strings_view, "\\d+:\\d+"); - int32_t h_expected[] = {0, 0, 2, 1, 0, 0}; + auto pattern = std::string("\\d+:\\d+"); + auto results = cudf::strings::count_re(strings_view, pattern); cudf::test::fixed_width_column_wrapper expected( - h_expected, - h_expected + h_strings.size(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + {0, 0, 2, 1, 0, 0}, cudf::test::iterators::nulls_from_nullptrs(h_strings)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(strings_view, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -381,67 +464,90 @@ TEST_F(StringsContainsTests, FixedQuantifier) { // exact match - auto results = cudf::strings::count_re(sv, "a{3}"); + auto pattern = std::string("a{3}"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 0, 1, 1, 1, 2}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // range match (greedy quantifier) - auto results = cudf::strings::count_re(sv, "a{3,5}"); + auto pattern = std::string("a{3,5}"); + auto results = 
cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 0, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // minimum match (greedy quantifier) - auto results = cudf::strings::count_re(sv, "a{2,}"); + auto pattern = std::string("a{2,}"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // range match (lazy quantifier) - auto results = cudf::strings::count_re(sv, "a{2,4}?"); + auto pattern = std::string("a{2,4}?"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 1, 1, 2, 2, 3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // minimum match (lazy quantifier) - auto results = cudf::strings::count_re(sv, "a{1,}?"); + auto pattern = std::string("a{1,}?"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({1, 2, 3, 4, 5, 6}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // zero match - auto results = cudf::strings::count_re(sv, "aaaa{0}"); + auto pattern = std::string("aaaa{0}"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 0, 1, 1, 1, 2}); 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { // poorly formed - auto results = cudf::strings::count_re(sv, "aaaa{n,m}"); + auto pattern = std::string("aaaa{n,m}"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 0, 0, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_THROW(cudf::strings::count_re(sv, "aaaa{1234,5678}"), cudf::logic_error); - EXPECT_THROW(cudf::strings::count_re(sv, "aaaa{123,5678}"), cudf::logic_error); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } TEST_F(StringsContainsTests, QuantifierErrors) { - auto input = cudf::test::strings_column_wrapper({"a", "aa", "aaa", "aaaa", "aaaaa", "aaaaaa"}); - auto sv = cudf::strings_column_view(input); - - EXPECT_THROW(cudf::strings::contains_re(sv, "^+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::count_re(sv, "$+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::count_re(sv, "(^)+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::contains_re(sv, "($)+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::count_re(sv, "\\A+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::count_re(sv, "\\Z+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::contains_re(sv, "(\\A)+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::contains_re(sv, "(\\Z)+"), cudf::logic_error); - - EXPECT_THROW(cudf::strings::contains_re(sv, "(^($))+"), cudf::logic_error); - EXPECT_NO_THROW(cudf::strings::contains_re(sv, "(^a($))+")); - EXPECT_NO_THROW(cudf::strings::count_re(sv, "(^(a$))+")); + EXPECT_THROW(cudf::strings::regex_program::create("^+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("$+"), cudf::logic_error); + 
EXPECT_THROW(cudf::strings::regex_program::create("(^)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("($)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("\\A+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("\\Z+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("(\\A)+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("(\\Z)+"), cudf::logic_error); + + EXPECT_THROW(cudf::strings::regex_program::create("(^($))+"), cudf::logic_error); + EXPECT_NO_THROW(cudf::strings::regex_program::create("(^a($))+")); + EXPECT_NO_THROW(cudf::strings::regex_program::create("(^(a$))+")); } TEST_F(StringsContainsTests, OverlappedClasses) @@ -450,14 +556,22 @@ TEST_F(StringsContainsTests, OverlappedClasses) auto sv = cudf::strings_column_view(input); { - auto results = cudf::strings::count_re(sv, "[e-gb-da-c]"); + auto pattern = std::string("[e-gb-da-c]"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({7, 4, 0, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::count_re(sv, "[á-éê-ú]"); + auto pattern = std::string("[á-éê-ú]"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({0, 1, 0, 6, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -467,14 +581,22 @@ TEST_F(StringsContainsTests, NegatedClasses) auto sv = cudf::strings_column_view(input); { - auto results = cudf::strings::count_re(sv, "[^a-f]"); + auto pattern = std::string("[^a-f]"); + auto results = 
cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({1, 4, 0, 5, 3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = cudf::strings::count_re(sv, "[^a-eá-é]"); + auto pattern = std::string("[^a-eá-é]"); + auto results = cudf::strings::count_re(sv, pattern); cudf::test::fixed_width_column_wrapper expected({2, 5, 0, 1, 3}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -487,14 +609,18 @@ TEST_F(StringsContainsTests, IncompleteClassesRange) cudf::test::fixed_width_column_wrapper expected({1, 0, 0, 1, 1}); auto results = cudf::strings::contains_re(sv, "[a-z]"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(sv, "[a-m-z]"); // same as [a-z] + + auto prog = cudf::strings::regex_program::create("[a-m-z]"); // same as [a-z] + results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { cudf::test::fixed_width_column_wrapper expected({1, 1, 0, 1, 1}); auto results = cudf::strings::contains_re(sv, "[g-]"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(sv, "[-k]"); + + auto prog = cudf::strings::regex_program::create("[-k]"); + results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { @@ -503,9 +629,12 @@ TEST_F(StringsContainsTests, IncompleteClassesRange) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = cudf::strings::contains_re(sv, "[+--]"); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(sv, "[a-c-]"); + + auto prog = 
cudf::strings::regex_program::create("[a-c-]"); + results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::contains_re(sv, "[-d-f]"); + prog = cudf::strings::regex_program::create("[-d-f]"); + results = cudf::strings::contains_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } @@ -516,26 +645,43 @@ TEST_F(StringsContainsTests, MultiLine) cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); auto view = cudf::strings_column_view(input); - auto results = cudf::strings::contains_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); + auto pattern = std::string("^abc$"); + auto prog = cudf::strings::regex_program::create(pattern); + auto prog_ml = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::MULTILINE); + + auto results = cudf::strings::contains_re(view, pattern, cudf::strings::regex_flags::MULTILINE); auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::contains_re(view, "^abc$"); + results = cudf::strings::contains_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + results = cudf::strings::contains_re(view, pattern); expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::matches_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); + results = cudf::strings::matches_re(view, pattern, cudf::strings::regex_flags::MULTILINE); auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - results = cudf::strings::matches_re(view, 
"^abc$"); + results = cudf::strings::matches_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + results = cudf::strings::matches_re(view, pattern); expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - results = cudf::strings::count_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); + results = cudf::strings::count_re(view, pattern, cudf::strings::regex_flags::MULTILINE); auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - results = cudf::strings::count_re(view, "^abc$"); + results = cudf::strings::count_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + results = cudf::strings::count_re(view, pattern); expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } TEST_F(StringsContainsTests, DotAll) @@ -543,31 +689,55 @@ TEST_F(StringsContainsTests, DotAll) auto input = cudf::test::strings_column_wrapper({"abc\nfa\nef", "fff\nabbc\nfff", "abcdef", ""}); auto view = cudf::strings_column_view(input); - auto results = cudf::strings::contains_re(view, "a.*f", cudf::strings::regex_flags::DOTALL); + auto pattern = std::string("a.*f"); + auto prog = cudf::strings::regex_program::create(pattern); + auto prog_dotall = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); + + auto results = cudf::strings::contains_re(view, pattern, cudf::strings::regex_flags::DOTALL); auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0}); 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::contains_re(view, "a.*f"); + results = cudf::strings::contains_re(view, *prog_dotall); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + results = cudf::strings::contains_re(view, pattern); expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); - results = cudf::strings::matches_re(view, "a.*f", cudf::strings::regex_flags::DOTALL); + results = cudf::strings::matches_re(view, pattern, cudf::strings::regex_flags::DOTALL); auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); - results = cudf::strings::matches_re(view, "a.*f"); + results = cudf::strings::matches_re(view, *prog_dotall); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + results = cudf::strings::matches_re(view, pattern); expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + + pattern = std::string("a.*?f"); + prog = cudf::strings::regex_program::create(pattern); + prog_dotall = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); - results = cudf::strings::count_re(view, "a.*?f", cudf::strings::regex_flags::DOTALL); + results = cudf::strings::count_re(view, pattern, cudf::strings::regex_flags::DOTALL); auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - results = cudf::strings::count_re(view, "a.*?f"); + results = cudf::strings::count_re(view, *prog_dotall); + 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + results = cudf::strings::count_re(view, pattern); expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); - auto both_flags = cudf::strings::regex_flags::DOTALL | cudf::strings::regex_flags::MULTILINE; - results = - cudf::strings::count_re(view, "a.*?f", static_cast(both_flags)); - expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); + auto both_flags = static_cast(cudf::strings::regex_flags::DOTALL | + cudf::strings::regex_flags::MULTILINE); + results = cudf::strings::count_re(view, pattern, both_flags); + expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + auto prog_both = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::count_re(view, *prog_both); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } @@ -586,9 +756,16 @@ TEST_F(StringsContainsTests, ASCII) auto results = cudf::strings::contains_re(view, ptn, cudf::strings::regex_flags::ASCII); auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + auto prog = cudf::strings::regex_program::create(ptn, cudf::strings::regex_flags::ASCII); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + results = cudf::strings::contains_re(view, ptn); expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + prog = cudf::strings::regex_program::create(ptn); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); } } diff 
--git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index e396ca42d6c..62d7ef2a418 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -76,6 +77,10 @@ TEST_F(StringsExtractTests, ExtractTest) columns.push_back(expected2.release()); cudf::table expected(std::move(columns)); CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(strings_view, pattern); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } TEST_F(StringsExtractTests, ExtractDomainTest) @@ -117,6 +122,10 @@ TEST_F(StringsExtractTests, ExtractDomainTest) }); cudf::table_view expected{{expected1}}; CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(strings_view, *prog); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } TEST_F(StringsExtractTests, ExtractEventTest) @@ -144,9 +153,13 @@ TEST_F(StringsExtractTests, ExtractEventTest) "Test Message Description"}); for (std::size_t idx = 0; idx < patterns.size(); ++idx) { - auto results = cudf::strings::extract(strings_view, patterns[idx]); + auto pattern = patterns[idx]; + auto results = cudf::strings::extract(strings_view, pattern); cudf::test::strings_column_wrapper expected({expecteds[idx]}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view().column(0), expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(strings_view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view().column(0), expected); } } @@ -156,15 +169,24 @@ TEST_F(StringsExtractTests, MultiLine) cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); auto view = cudf::strings_column_view(input); - 
auto results = cudf::strings::extract(view, "(^[a-c]+$)", cudf::strings::regex_flags::MULTILINE); + auto pattern = std::string("(^[a-c]+$)"); + auto results = cudf::strings::extract(view, pattern, cudf::strings::regex_flags::MULTILINE); cudf::test::strings_column_wrapper expected_multiline({"abc", "abc", "abc", "", "abc"}, {1, 1, 1, 0, 1}); auto expected = cudf::table_view{{expected_multiline}}; CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); - results = cudf::strings::extract(view, "^([a-c]+)$"); + auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::MULTILINE); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + + pattern = std::string("^([a-c]+)$"); + results = cudf::strings::extract(view, pattern); cudf::test::strings_column_wrapper expected_default({"", "", "abc", "", ""}, {0, 0, 1, 0, 0}); expected = cudf::table_view{{expected_default}}; CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } TEST_F(StringsExtractTests, DotAll) @@ -172,15 +194,23 @@ TEST_F(StringsExtractTests, DotAll) auto input = cudf::test::strings_column_wrapper({"abc\nfa\nef", "fff\nabbc\nfff", "abcdef", ""}); auto view = cudf::strings_column_view(input); - auto results = cudf::strings::extract(view, "(a.*f)", cudf::strings::regex_flags::DOTALL); + auto pattern = std::string("(a.*f)"); + auto results = cudf::strings::extract(view, pattern, cudf::strings::regex_flags::DOTALL); cudf::test::strings_column_wrapper expected_dotall({"abc\nfa\nef", "abbc\nfff", "abcdef", ""}, {1, 1, 1, 0}); auto expected = cudf::table_view{{expected_dotall}}; CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); - results = cudf::strings::extract(view, "(a.*f)"); + auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); + results = 
cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + + results = cudf::strings::extract(view, pattern); cudf::test::strings_column_wrapper expected_default({"", "", "abcdef", ""}, {0, 0, 1, 0}); expected = cudf::table_view{{expected_default}}; CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } TEST_F(StringsExtractTests, EmptyExtractTest) @@ -192,7 +222,8 @@ TEST_F(StringsExtractTests, EmptyExtractTest) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::extract(strings_view, "([^_]*)\\Z"); + auto pattern = std::string("([^_]*)\\Z"); + auto results = cudf::strings::extract(strings_view, pattern); std::vector h_expected{nullptr, "AAA", "A", "", "", ""}; cudf::test::strings_column_wrapper expected( @@ -203,6 +234,9 @@ TEST_F(StringsExtractTests, EmptyExtractTest) columns.push_back(expected.release()); cudf::table table_expected(std::move(columns)); CUDF_TEST_EXPECT_TABLES_EQUAL(*results, table_expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract(strings_view, *prog); + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, table_expected); } TEST_F(StringsExtractTests, ExtractAllTest) @@ -214,7 +248,8 @@ TEST_F(StringsExtractTests, ExtractAllTest) cudf::test::strings_column_wrapper input(h_input.begin(), h_input.end(), validity); auto sv = cudf::strings_column_view(input); - auto results = cudf::strings::extract_all_record(sv, "(\\d+) (\\w+)"); + auto pattern = std::string("(\\d+) (\\w+)"); + auto results = cudf::strings::extract_all_record(sv, pattern); bool valids[] = {true, true, true, false, false, false, true}; using LCW = cudf::test::lists_column_wrapper; @@ -226,15 +261,24 @@ 
TEST_F(StringsExtractTests, ExtractAllTest) LCW{}, LCW{"4", "pare"}}, valids); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::extract_all_record(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } TEST_F(StringsExtractTests, Errors) { cudf::test::strings_column_wrapper input({"this column intentionally left blank"}); auto sv = cudf::strings_column_view(input); - EXPECT_THROW(cudf::strings::extract(sv, "\\w+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::extract_all_record(sv, "\\w+"), cudf::logic_error); + + auto pattern = std::string("\\w+"); + auto prog = cudf::strings::regex_program::create(pattern); + + EXPECT_THROW(cudf::strings::extract(sv, pattern), cudf::logic_error); + EXPECT_THROW(cudf::strings::extract(sv, *prog), cudf::logic_error); + EXPECT_THROW(cudf::strings::extract_all_record(sv, pattern), cudf::logic_error); + EXPECT_THROW(cudf::strings::extract_all_record(sv, *prog), cudf::logic_error); } TEST_F(StringsExtractTests, MediumRegex) diff --git a/cpp/tests/strings/fill_tests.cpp b/cpp/tests/strings/fill_tests.cpp index ed731fe39b4..c3a1710bb83 100644 --- a/cpp/tests/strings/fill_tests.cpp +++ b/cpp/tests/strings/fill_tests.cpp @@ -17,13 +17,11 @@ #include #include #include +#include #include +#include #include -#include -#include - -#include #include @@ -33,48 +31,37 @@ struct StringsFillTest : public cudf::test::BaseFixture { TEST_F(StringsFillTest, Fill) { std::vector h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view view(strings); + cudf::test::strings_column_wrapper input( + h_strings.begin(), h_strings.end(), 
cudf::test::iterators::nulls_from_nullptrs(h_strings)); + { - auto results = cudf::strings::detail::fill( - view, 1, 5, cudf::string_scalar("zz"), cudf::get_default_stream()); + auto results = cudf::fill(input, 1, 5, cudf::string_scalar("zz")); std::vector h_expected{"eee", "zz", "zz", "zz", "zz", "bbb", "ééé"}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::detail::fill( - view, 2, 4, cudf::string_scalar("", false), cudf::get_default_stream()); + auto results = cudf::fill(input, 2, 4, cudf::string_scalar("", false)); std::vector h_expected{"eee", "bb", nullptr, nullptr, "aa", "bbb", "ééé"}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::detail::fill( - view, 5, 5, cudf::string_scalar("zz"), cudf::get_default_stream()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, view.parent()); + auto results = cudf::fill(input, 5, 5, cudf::string_scalar("zz")); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, input); } { - auto results = - cudf::strings::detail::fill(view, 0, 7, cudf::string_scalar(""), cudf::get_default_stream()); + auto results = cudf::fill(input, 0, 7, cudf::string_scalar("")); cudf::test::strings_column_wrapper expected({"", "", "", "", "", "", ""}, {1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::detail::fill( - view, 0, 7, cudf::string_scalar("", 
false), cudf::get_default_stream()); + auto results = cudf::fill(input, 0, 7, cudf::string_scalar("", false)); cudf::test::strings_column_wrapper expected({"", "", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); @@ -85,27 +72,16 @@ TEST_F(StringsFillTest, ZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::detail::fill(cudf::strings_column_view(zero_size_strings_column), - 0, - 1, - cudf::string_scalar(""), - cudf::get_default_stream()); + auto results = cudf::fill(zero_size_strings_column, 0, 0, cudf::string_scalar("")); cudf::test::expect_column_empty(results->view()); } TEST_F(StringsFillTest, FillRangeError) { std::vector h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view view(strings); + cudf::test::strings_column_wrapper input( + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); - EXPECT_THROW( - cudf::strings::detail::fill(view, 5, 1, cudf::string_scalar(""), cudf::get_default_stream()), - cudf::logic_error); - EXPECT_THROW( - cudf::strings::detail::fill(view, 5, 9, cudf::string_scalar(""), cudf::get_default_stream()), - cudf::logic_error); + EXPECT_THROW(cudf::fill(input, 5, 1, cudf::string_scalar("")), cudf::logic_error); + EXPECT_THROW(cudf::fill(input, 5, 9, cudf::string_scalar("")), cudf::logic_error); } diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 1dd088cb70f..6428be28e0a 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -35,8 +36,10 @@ TEST_F(StringsFindallTests, FindallTest) 
cudf::test::strings_column_wrapper input( {"3-A", "4-May 5-Day 6-Hay", "12-Dec-2021-Jan", "Feb-March", "4 ABC", "", "", "25-9000-Hal"}, valids); + auto sv = cudf::strings_column_view(input); - auto results = cudf::strings::findall(cudf::strings_column_view(input), "(\\d+)-(\\w+)"); + auto pattern = std::string("(\\d+)-(\\w+)"); + auto results = cudf::strings::findall(sv, pattern); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"3-A"}, @@ -49,6 +52,9 @@ TEST_F(StringsFindallTests, FindallTest) LCW{"25-9000"}}, valids); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::findall(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } TEST_F(StringsFindallTests, Multiline) @@ -56,10 +62,14 @@ TEST_F(StringsFindallTests, Multiline) cudf::test::strings_column_wrapper input({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); auto view = cudf::strings_column_view(input); - auto results = cudf::strings::findall(view, "(^abc$)", cudf::strings::regex_flags::MULTILINE); + auto pattern = std::string("(^abc$)"); + auto results = cudf::strings::findall(view, pattern, cudf::strings::regex_flags::MULTILINE); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"abc", "abc"}, LCW{"abc"}, LCW{"abc"}, LCW{}, LCW{"abc"}}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::MULTILINE); + results = cudf::strings::findall(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } TEST_F(StringsFindallTests, DotAll) @@ -67,10 +77,14 @@ TEST_F(StringsFindallTests, DotAll) cudf::test::strings_column_wrapper input({"abc\nfa\nef", "fff\nabbc\nfff", "abcdef", ""}); auto view = cudf::strings_column_view(input); - auto results = cudf::strings::findall(view, "(b.*f)", cudf::strings::regex_flags::DOTALL); + auto pattern = 
std::string("(b.*f)"); + auto results = cudf::strings::findall(view, pattern, cudf::strings::regex_flags::DOTALL); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"bc\nfa\nef"}, LCW{"bbc\nfff"}, LCW{"bcdef"}, LCW{}}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL); + results = cudf::strings::findall(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } TEST_F(StringsFindallTests, MediumRegex) diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 6280463d112..840d998e56c 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -17,7 +17,9 @@ #include #include #include +#include +#include #include #include @@ -39,9 +41,7 @@ TEST_F(StringsReplaceRegexTest, ReplaceRegexTest) nullptr}; cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); auto strings_view = cudf::strings_column_view(strings); std::vector h_expected{"= quick brown fox jumps over = lazy dog", @@ -52,13 +52,15 @@ TEST_F(StringsReplaceRegexTest, ReplaceRegexTest) "", nullptr}; - std::string pattern = "(\\bthe\\b)"; - auto results = cudf::strings::replace_re(strings_view, pattern, cudf::string_scalar("=")); + auto pattern = std::string("(\\bthe\\b)"); + auto repl = cudf::string_scalar("="); + auto results = cudf::strings::replace_re(strings_view, pattern, repl); cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + h_expected.begin(), h_expected.end(), 
cudf::test::iterators::nulls_from_nullptrs(h_expected)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(strings_view, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, ReplaceMultiRegexTest) @@ -72,9 +74,7 @@ TEST_F(StringsReplaceRegexTest, ReplaceMultiRegexTest) nullptr}; cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); auto strings_view = cudf::strings_column_view(strings); std::vector h_expected{" quick brown fox jumps over lazy dog", @@ -91,101 +91,132 @@ TEST_F(StringsReplaceRegexTest, ReplaceMultiRegexTest) auto repls_view = cudf::strings_column_view(repls); auto results = cudf::strings::replace_re(strings_view, patterns, repls_view); cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, InvalidRegex) { - cudf::test::strings_column_wrapper strings( - {"abc*def|ghi+jkl", ""}); // these do not really matter - auto strings_view = cudf::strings_column_view(strings); - // these are quantifiers that do not have a preceding character/class - EXPECT_THROW(cudf::strings::replace_re(strings_view, "*", cudf::string_scalar("")), - cudf::logic_error); - EXPECT_THROW(cudf::strings::replace_re(strings_view, "|", cudf::string_scalar("")), - cudf::logic_error); - EXPECT_THROW(cudf::strings::replace_re(strings_view, "+", 
cudf::string_scalar("")), - cudf::logic_error); - EXPECT_THROW(cudf::strings::replace_re(strings_view, "ab(*)", cudf::string_scalar("")), - cudf::logic_error); - EXPECT_THROW(cudf::strings::replace_re(strings_view, "\\", cudf::string_scalar("")), - cudf::logic_error); - EXPECT_THROW(cudf::strings::replace_re(strings_view, "\\p", cudf::string_scalar("")), - cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("*"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("|"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("ab(*)"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("\\"), cudf::logic_error); + EXPECT_THROW(cudf::strings::regex_program::create("\\p"), cudf::logic_error); } TEST_F(StringsReplaceRegexTest, WithEmptyPattern) { std::vector h_strings{"asd", "xcv"}; cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); auto strings_view = cudf::strings_column_view(strings); - std::vector patterns({""}); + + auto empty_pattern = std::string(""); + auto repl = cudf::string_scalar("bbb"); + std::vector patterns({empty_pattern}); cudf::test::strings_column_wrapper repls({"bbb"}); auto repls_view = cudf::strings_column_view(repls); auto results = cudf::strings::replace_re(strings_view, patterns, repls_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); - results = cudf::strings::replace_re(strings_view, "", cudf::string_scalar("bbb")); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, strings); + results = cudf::strings::replace_re(strings_view, "", repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, strings); + auto prog = 
cudf::strings::regex_program::create(empty_pattern); + results = cudf::strings::replace_re(strings_view, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, strings); } TEST_F(StringsReplaceRegexTest, MultiReplacement) { cudf::test::strings_column_wrapper input({"aba bcd aba", "abababa abababa"}); - auto results = - cudf::strings::replace_re(cudf::strings_column_view(input), "aba", cudf::string_scalar("_"), 2); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("aba"); + auto repl = cudf::string_scalar("_"); + auto results = cudf::strings::replace_re(sv, pattern, repl, 2); cudf::test::strings_column_wrapper expected({"_ bcd _", "_b_ abababa"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - results = - cudf::strings::replace_re(cudf::strings_column_view(input), "aba", cudf::string_scalar(""), 0); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl, 2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + results = cudf::strings::replace_re(sv, pattern, repl, 0); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, input); + results = cudf::strings::replace_re(sv, *prog, repl, 0); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, input); } TEST_F(StringsReplaceRegexTest, WordBoundary) { cudf::test::strings_column_wrapper input({"aba bcd\naba", "zéz", "A1B2-é3", "e é", "_", "a_b"}); - auto results = - cudf::strings::replace_re(cudf::strings_column_view(input), "\\b", cudf::string_scalar("X")); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("\\b"); + auto repl = cudf::string_scalar("X"); + auto results = cudf::strings::replace_re(sv, pattern, repl); auto expected = cudf::test::strings_column_wrapper( {"XabaX XbcdX\nXabaX", "XzézX", "XA1B2X-Xé3X", "XeX XéX", "X_X", "Xa_bX"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - results = - cudf::strings::replace_re(cudf::strings_column_view(input), "\\B", 
cudf::string_scalar("X")); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + pattern = std::string("\\B"); + results = cudf::strings::replace_re(sv, pattern, repl); expected = cudf::test::strings_column_wrapper( {"aXbXa bXcXd\naXbXa", "zXéXz", "AX1XBX2-éX3", "e é", "_", "aX_Xb"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, Alternation) { cudf::test::strings_column_wrapper input( {"16 6 brr 232323 1 hello 90", "123 ABC 00 2022", "abé123 4567 89xyz"}); - auto results = cudf::strings::replace_re( - cudf::strings_column_view(input), "(^|\\s)\\d+(\\s|$)", cudf::string_scalar("_")); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("(^|\\s)\\d+(\\s|$)"); + auto repl = cudf::string_scalar("_"); + auto results = cudf::strings::replace_re(sv, pattern, repl); auto expected = cudf::test::strings_column_wrapper({"__ brr __ hello _", "_ABC_2022", "abé123 _ 89xyz"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - results = cudf::strings::replace_re( - cudf::strings_column_view(input), "(\\s|^)\\d+($|\\s)", cudf::string_scalar("_")); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + pattern = std::string("(\\s|^)\\d+($|\\s)"); + results = cudf::strings::replace_re(sv, pattern, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, ZeroLengthMatch) { 
cudf::test::strings_column_wrapper input({"DD", "zéz", "DsDs", ""}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("D*"); auto repl = cudf::string_scalar("_"); - auto results = cudf::strings::replace_re(cudf::strings_column_view(input), "D*", repl); + auto results = cudf::strings::replace_re(sv, pattern, repl); auto expected = cudf::test::strings_column_wrapper({"__", "_z_é_z_", "__s__s_", "_"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - results = cudf::strings::replace_re(cudf::strings_column_view(input), "D?s?", repl); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + pattern = std::string("D?s?"); + results = cudf::strings::replace_re(sv, pattern, repl); expected = cudf::test::strings_column_wrapper({"___", "_z_é_z_", "___", "_"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, Multiline) @@ -196,14 +227,21 @@ TEST_F(StringsReplaceRegexTest, Multiline) auto sv = cudf::strings_column_view(input); // single-replace - auto results = - cudf::strings::replace_re(sv, "^aba$", cudf::string_scalar("_"), std::nullopt, multiline); + auto pattern = std::string("^aba$"); + auto repl = cudf::string_scalar("_"); + auto results = cudf::strings::replace_re(sv, pattern, repl, std::nullopt, multiline); cudf::test::strings_column_wrapper expected_ml({"bcd\n_\nefg", "_\naba abab\n_", "_"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_ml); + auto prog = cudf::strings::regex_program::create(pattern, multiline); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_ml); - results = cudf::strings::replace_re(sv, "^aba$", 
cudf::string_scalar("_")); + results = cudf::strings::replace_re(sv, pattern, repl); cudf::test::strings_column_wrapper expected({"bcd\naba\nefg", "aba\naba abab\naba", "_"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_re(sv, *prog, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); // multi-replace std::vector patterns({"aba$", "^aba"}); @@ -217,15 +255,23 @@ TEST_F(StringsReplaceRegexTest, Multiline) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, multi_expected); // backref-replace - results = cudf::strings::replace_with_backrefs(sv, "(^aba)", "[\\1]", multiline); + auto repl_template = std::string("[\\1]"); + pattern = std::string("(^aba)"); + results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template, multiline); cudf::test::strings_column_wrapper br_expected_ml( {"bcd\n[aba]\nefg", "[aba]\n[aba] abab\n[aba]", "[aba]"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected_ml); + prog = cudf::strings::regex_program::create(pattern, multiline); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected_ml); - results = cudf::strings::replace_with_backrefs(sv, "(^aba)", "[\\1]"); + results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper br_expected( {"bcd\naba\nefg", "[aba]\naba abab\naba", "[aba]"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); } TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) @@ -239,10 +285,8 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) nullptr}; cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - 
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - auto strings_view = cudf::strings_column_view(strings); + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); + auto sv = cudf::strings_column_view(strings); std::vector h_expected{"the-quick-brown-fox-jumps-over-the-lazy-dog", "the-fat-cat-lays-next-to-the-other-accénted-cat", @@ -252,38 +296,43 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) "", nullptr}; - std::string pattern = "(\\w) (\\w)"; - std::string repl_template = "\\1-\\2"; - auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); + auto pattern = std::string("(\\w) (\\w)"); + auto repl_template = std::string("\\1-\\2"); + auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexAltIndexPatternTest) { - cudf::test::strings_column_wrapper strings({"12-3 34-5 67-89", "0-99: 777-888:: 5673-0"}); - auto strings_view = cudf::strings_column_view(strings); + cudf::test::strings_column_wrapper input({"12-3 34-5 67-89", "0-99: 777-888:: 5673-0"}); + auto sv = cudf::strings_column_view(input); - std::string pattern = "(\\d+)-(\\d+)"; - std::string repl_template = "${2} X ${1}0"; - auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); + auto pattern = std::string("(\\d+)-(\\d+)"); + auto repl_template 
= std::string("${2} X ${1}0"); + auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected( {"3 X 120 5 X 340 89 X 670", "99 X 00: 888 X 7770:: 0 X 56730"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexReversedTest) { cudf::test::strings_column_wrapper strings( {"A543", "Z756", "", "tést-string", "two-thréé four-fivé", "abcd-éfgh", "tést-string-again"}); - auto strings_view = cudf::strings_column_view(strings); - std::string pattern = "([a-z])-([a-zé])"; - std::string repl_template = "X\\2+\\1Z"; - auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); + auto sv = cudf::strings_column_view(strings); + + auto pattern = std::string("([a-z])-([a-zé])"); + auto repl_template = std::string("X\\2+\\1Z"); + auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected({"A543", "Z756", @@ -293,33 +342,45 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexReversedTest) "abcXé+dZfgh", "tésXs+tZtrinXa+gZgain"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, BackrefWithGreedyQuantifier) { cudf::test::strings_column_wrapper input( {"

title

ABC

", "

1234567

XYZ

"}); - std::string replacement = "

\\1

\\2

"; + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("

(.*)

(.*)

"); + auto repl_template = std::string("

\\1

\\2

"); - auto results = cudf::strings::replace_with_backrefs( - cudf::strings_column_view(input), "

(.*)

(.*)

", replacement); + auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected( {"

title

ABC

", "

1234567

XYZ

"}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); - results = cudf::strings::replace_with_backrefs( - cudf::strings_column_view(input), "

([a-z\\d]+)

([A-Z]+)

", replacement); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + pattern = std::string("

([a-z\\d]+)

([A-Z]+)

"); + results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexZeroIndexTest) { cudf::test::strings_column_wrapper strings( {"TEST123", "TEST1TEST2", "TEST2-TEST1122", "TEST1-TEST-T", "TES3"}); - auto strings_view = cudf::strings_column_view(strings); - std::string pattern = "(TEST)(\\d+)"; - std::string repl_template = "${0}: ${1}, ${2}; "; - auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); + auto sv = cudf::strings_column_view(strings); + + auto pattern = std::string("(TEST)(\\d+)"); + auto repl_template = std::string("${0}: ${1}, ${2}; "); + auto results = cudf::strings::replace_with_backrefs(sv, pattern, repl_template); cudf::test::strings_column_wrapper expected({ "TEST123: TEST, 123; ", @@ -329,6 +390,9 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexZeroIndexTest) "TES3", }); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto prog = cudf::strings::regex_program::create(pattern); + results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexErrorTest) diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index cd39c1e088a..da0667f54cf 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -50,8 +51,8 @@ struct StringsReplaceTest : public cudf::test::BaseFixture { TEST_F(StringsReplaceTest, Replace) { - auto strings = build_corpus(); - auto strings_view = cudf::strings_column_view(strings); + auto input = build_corpus(); 
+ auto strings_view = cudf::strings_column_view(input); // replace all occurrences of 'the ' with '++++ ' std::vector h_expected{"++++ quick brown fox jumps over ++++ lazy dog", "++++ fat cat lays next to ++++ other accénted cat", @@ -61,24 +62,29 @@ TEST_F(StringsReplaceTest, Replace) "", nullptr}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); + + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + auto results = cudf::strings::replace(strings_view, cudf::string_scalar("the "), cudf::string_scalar("++++ ")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("the "), cudf::string_scalar("++++ ")); + strings_view, cudf::string_scalar("the "), cudf::string_scalar("++++ "), -1, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("the "), cudf::string_scalar("++++ ")); + strings_view, cudf::string_scalar("the "), cudf::string_scalar("++++ "), -1, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(StringsReplaceTest, ReplaceReplLimit) { - auto strings = build_corpus(); - auto strings_view = cudf::strings_column_view(strings); + auto input = build_corpus(); + auto strings_view = cudf::strings_column_view(input); + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + // only remove the first occurrence of 'the ' std::vector h_expected{"quick brown fox jumps over the lazy dog", "fat cat lays next to the other accénted cat", @@ -88,23 +94,21 @@ TEST_F(StringsReplaceTest, ReplaceReplLimit) "", nullptr}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), 
- h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); auto results = cudf::strings::replace(strings_view, cudf::string_scalar("the "), cudf::string_scalar(""), 1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("the "), cudf::string_scalar(""), 1); + strings_view, cudf::string_scalar("the "), cudf::string_scalar(""), 1, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("the "), cudf::string_scalar(""), 1); + strings_view, cudf::string_scalar("the "), cudf::string_scalar(""), 1, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(StringsReplaceTest, ReplaceReplLimitInputSliced) { - auto strings = build_corpus(); + auto input = build_corpus(); // replace first two occurrences of ' ' with '--' std::vector h_expected{"the--quick--brown fox jumps over the lazy dog", "the--fat--cat lays next to the other accénted cat", @@ -114,11 +118,11 @@ TEST_F(StringsReplaceTest, ReplaceReplLimitInputSliced) "", nullptr}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); std::vector slice_indices{0, 2, 2, 3, 3, 7}; - auto sliced_strings = cudf::slice(strings, slice_indices); + auto sliced_strings = cudf::slice(input, slice_indices); auto sliced_expected = cudf::slice(expected, slice_indices); for (size_t i = 0; i < sliced_strings.size(); ++i) { auto strings_view = cudf::strings_column_view(sliced_strings[i]); @@ 
-126,10 +130,10 @@ TEST_F(StringsReplaceTest, ReplaceReplLimitInputSliced) cudf::strings::replace(strings_view, cudf::string_scalar(" "), cudf::string_scalar("--"), 2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, sliced_expected[i]); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar(" "), cudf::string_scalar("--"), 2); + strings_view, cudf::string_scalar(" "), cudf::string_scalar("--"), 2, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, sliced_expected[i]); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar(" "), cudf::string_scalar("--"), 2); + strings_view, cudf::string_scalar(" "), cudf::string_scalar("--"), 2, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, sliced_expected[i]); } } @@ -139,9 +143,9 @@ TEST_F(StringsReplaceTest, ReplaceTargetOverlap) auto corpus = build_corpus(); auto corpus_view = cudf::strings_column_view(corpus); // replace all occurrences of 'the ' with '+++++++ ' - auto strings = cudf::strings::replace( + auto input = cudf::strings::replace( corpus_view, cudf::string_scalar("the "), cudf::string_scalar("++++++++ ")); - auto strings_view = cudf::strings_column_view(*strings); + auto strings_view = cudf::strings_column_view(*input); // replace all occurrences of '+++' with 'plus ' std::vector h_expected{ "plus plus ++ quick brown fox jumps over plus plus ++ lazy dog", @@ -152,60 +156,71 @@ TEST_F(StringsReplaceTest, ReplaceTargetOverlap) "", nullptr}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); + + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + auto results = cudf::strings::replace(strings_view, cudf::string_scalar("+++"), cudf::string_scalar("plus ")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, 
expected); + results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("+++"), cudf::string_scalar("plus ")); + strings_view, cudf::string_scalar("+++"), cudf::string_scalar("plus "), -1, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("+++"), cudf::string_scalar("plus ")); + strings_view, cudf::string_scalar("+++"), cudf::string_scalar("plus "), -1, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(StringsReplaceTest, ReplaceTargetOverlapsStrings) { - auto strings = build_corpus(); - auto strings_view = cudf::strings_column_view(strings); + auto input = build_corpus(); + auto strings_view = cudf::strings_column_view(input); + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + // replace all occurrences of 'dogthe' with '+' // should not replace anything unless it incorrectly matches across a string boundary auto results = cudf::strings::replace(strings_view, cudf::string_scalar("dogthe"), cudf::string_scalar("+")); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("dogthe"), cudf::string_scalar("+")); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + strings_view, cudf::string_scalar("dogthe"), cudf::string_scalar("+"), -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("dogthe"), cudf::string_scalar("+")); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + strings_view, cudf::string_scalar("dogthe"), cudf::string_scalar("+"), -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); } TEST_F(StringsReplaceTest, ReplaceNullInput) { std::vector h_null_strings(128); - auto strings = cudf::test::strings_column_wrapper( + auto input = 
cudf::test::strings_column_wrapper( h_null_strings.begin(), h_null_strings.end(), thrust::make_constant_iterator(false)); - auto strings_view = cudf::strings_column_view(strings); + auto strings_view = cudf::strings_column_view(input); + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); // replace all occurrences of '+' with '' // should not replace anything as input is all null auto results = cudf::strings::replace(strings_view, cudf::string_scalar("+"), cudf::string_scalar("")); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("+"), cudf::string_scalar("")); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + strings_view, cudf::string_scalar("+"), cudf::string_scalar(""), -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("+"), cudf::string_scalar("")); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + strings_view, cudf::string_scalar("+"), cudf::string_scalar(""), -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); } TEST_F(StringsReplaceTest, ReplaceEndOfString) { - auto strings = build_corpus(); - auto strings_view = cudf::strings_column_view(strings); + auto input = build_corpus(); + auto strings_view = cudf::strings_column_view(input); + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + // replace all occurrences of 'in' with ' ' std::vector h_expected{"the quick brown fox jumps over the lazy dog", "the fat cat lays next to the other accénted cat", @@ -216,20 +231,18 @@ TEST_F(StringsReplaceTest, ReplaceEndOfString) nullptr}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + 
h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); auto results = cudf::strings::replace(strings_view, cudf::string_scalar("in"), cudf::string_scalar(" ")); cudf::test::expect_columns_equal(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("in"), cudf::string_scalar(" ")); + strings_view, cudf::string_scalar("in"), cudf::string_scalar(" "), -1, stream, mr); cudf::test::expect_columns_equal(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("in"), cudf::string_scalar(" ")); + strings_view, cudf::string_scalar("in"), cudf::string_scalar(" "), -1, stream, mr); cudf::test::expect_columns_equal(*results, expected); } diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index c7bbce263f3..73d5adab427 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -14,20 +14,21 @@ * limitations under the License. 
*/ +#include +#include +#include +#include +#include + #include #include +#include #include #include #include #include #include -#include -#include -#include -#include -#include - #include #include @@ -316,21 +317,28 @@ TEST_F(StringsSplitTest, SplitRegex) auto sv = cudf::strings_column_view(input); { - auto result = cudf::strings::split_re(sv, "\\s+"); + auto pattern = std::string("\\s+"); + auto result = cudf::strings::split_re(sv, pattern); cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, validity); cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0}); cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0}); auto expected = cudf::table_view({col0, col1, col2}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_re(sv, *prog); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); // rsplit == split when using default parameters - result = cudf::strings::rsplit_re(sv, "\\s+"); + result = cudf::strings::rsplit_re(sv, pattern); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_re(sv, *prog); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } { - auto result = cudf::strings::split_re(sv, "[eé]"); + auto pattern = std::string("[eé]"); + auto result = cudf::strings::split_re(sv, pattern); cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, validity); cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""}, @@ -339,9 +347,14 @@ TEST_F(StringsSplitTest, SplitRegex) cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0}); auto expected = cudf::table_view({col0, col1, col2, col3}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_re(sv, *prog); + 
CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); // rsplit == split when using default parameters - result = cudf::strings::rsplit_re(sv, "[eé]"); + result = cudf::strings::rsplit_re(sv, pattern); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_re(sv, *prog); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } } @@ -356,20 +369,27 @@ TEST_F(StringsSplitTest, SplitRecordRegex) using LCW = cudf::test::lists_column_wrapper; { - auto result = cudf::strings::split_record_re(sv, "\\s+"); + auto pattern = std::string("\\s+"); + auto result = cudf::strings::split_record_re(sv, pattern); LCW expected( {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", ""}, LCW{"tést", "String"}, LCW{""}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); // rsplit == split when using default parameters - result = cudf::strings::rsplit_record_re(sv, "\\s+"); + result = cudf::strings::rsplit_record_re(sv, pattern); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_record_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } { - auto result = cudf::strings::split_record_re(sv, "[eé]"); + auto pattern = std::string("[eé]"); + auto result = cudf::strings::split_record_re(sv, pattern); LCW expected({LCW{" H", "llo th", "s", ""}, LCW{}, @@ -378,9 +398,14 @@ TEST_F(StringsSplitTest, SplitRecordRegex) LCW{""}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); // rsplit == split when using default parameters - result = 
cudf::strings::rsplit_record_re(sv, "[eé]"); + result = cudf::strings::rsplit_record_re(sv, pattern); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_record_re(sv, *prog); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } } @@ -393,37 +418,51 @@ TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); auto sv = cudf::strings_column_view(input); { - auto result = cudf::strings::split_re(sv, "\\s+", 1); + auto pattern = std::string("\\s+"); + auto result = cudf::strings::split_re(sv, pattern, 1); cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some ", "String", ""}, {1, 0, 1, 1, 0}); auto expected = cudf::table_view({col0, col1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_re(sv, *prog, 1); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); // split everything is the same output as maxsplit==2 for the test input column here - result = cudf::strings::split_re(sv, "\\s+", 2); - auto expected2 = cudf::strings::split_re(sv, "\\s+"); + result = cudf::strings::split_re(sv, pattern, 2); + auto expected2 = cudf::strings::split_re(sv, pattern); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected2->view()); + result = cudf::strings::split_re(sv, *prog, 3); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected2->view()); } { - auto result = cudf::strings::split_record_re(sv, "\\s", 1); + auto pattern = std::string("\\s"); + auto result = cudf::strings::split_record_re(sv, pattern, 1); using LCW = cudf::test::lists_column_wrapper; LCW expected1( {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), 
expected1); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_record_re(sv, *prog, 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected1); - result = cudf::strings::split_record_re(sv, "\\s", 2); + result = cudf::strings::split_record_re(sv, pattern, 2); LCW expected2( {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", " "}, LCW{"tést", "String"}, LCW{""}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected2); + result = cudf::strings::split_record_re(sv, *prog, 2); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected2); // split everything is the same output as maxsplit==3 for the test input column here - result = cudf::strings::split_record_re(sv, "\\s", 3); - auto expected0 = cudf::strings::split_record_re(sv, "\\s"); + result = cudf::strings::split_record_re(sv, pattern, 3); + auto expected0 = cudf::strings::split_record_re(sv, pattern); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); + result = cudf::strings::split_record_re(sv, *prog, 3); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); } } @@ -433,7 +472,8 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary) cudf::test::strings_column_wrapper input({"a", "ab", "-+", "e\né"}); auto sv = cudf::strings_column_view(input); { - auto result = cudf::strings::split_re(sv, "\\b"); + auto pattern = std::string("\\b"); + auto result = cudf::strings::split_re(sv, pattern); cudf::test::strings_column_wrapper col0({"", "", "-+", ""}); cudf::test::strings_column_wrapper col1({"a", "ab", "", "e"}, {1, 1, 0, 1}); @@ -442,13 +482,20 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary) cudf::test::strings_column_wrapper col4({"", "", "", ""}, {0, 0, 0, 1}); auto expected = cudf::table_view({col0, col1, col2, col3, col4}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_re(sv, *prog); + 
CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } { - auto result = cudf::strings::split_record_re(sv, "\\B"); + auto pattern = std::string("\\B"); + auto result = cudf::strings::split_record_re(sv, pattern); using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{"a"}, LCW{"a", "b"}, LCW{"", "-", "+", ""}, LCW{"e\né"}}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + auto prog = cudf::strings::regex_program::create(pattern); + result = cudf::strings::split_record_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } } @@ -551,26 +598,35 @@ TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit) cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); auto sv = cudf::strings_column_view(input); + auto pattern = std::string("\\s+"); + auto prog = cudf::strings::regex_program::create(pattern); + { - auto result = cudf::strings::rsplit_re(sv, "\\s+", 1); + auto result = cudf::strings::rsplit_re(sv, pattern, 1); cudf::test::strings_column_wrapper col0({" Héllo", "", "are some", "tést", ""}, validity); cudf::test::strings_column_wrapper col1({"thesé", "", "", "String", ""}, {1, 0, 1, 1, 0}); auto expected = cudf::table_view({col0, col1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_re(sv, *prog, 1); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } { - auto result = cudf::strings::rsplit_record_re(sv, "\\s+", 1); + auto result = cudf::strings::rsplit_record_re(sv, pattern, 1); using LCW = cudf::test::lists_column_wrapper; LCW expected( {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_record_re(sv, *prog, 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); // split everything is the same output as any maxsplit > 2 for the test input column here - result = 
cudf::strings::rsplit_record_re(sv, "\\s+", 3); - auto expected0 = cudf::strings::rsplit_record_re(sv, "\\s+"); + result = cudf::strings::rsplit_record_re(sv, pattern, 3); + auto expected0 = cudf::strings::rsplit_record_re(sv, pattern); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); + result = cudf::strings::rsplit_record_re(sv, *prog, 3); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); } } diff --git a/cpp/tests/quantiles/tdigest_utilities.cu b/cpp/tests/utilities/tdigest_utilities.cu similarity index 99% rename from cpp/tests/quantiles/tdigest_utilities.cu rename to cpp/tests/utilities/tdigest_utilities.cu index 68147dc29eb..beed9893d71 100644 --- a/cpp/tests/quantiles/tdigest_utilities.cu +++ b/cpp/tests/utilities/tdigest_utilities.cu @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include diff --git a/dependencies.yaml b/dependencies.yaml new file mode 100644 index 00000000000..b8470f02f86 --- /dev/null +++ b/dependencies.yaml @@ -0,0 +1,207 @@ +# Dependency list for https://github.com/rapidsai/dependency-file-generator +files: + all: + output: conda + matrix: + cuda: ["11.5"] + arch: [x86_64] + includes: + - build + - cudatoolkit + - develop + - doc + - notebook + - run + - test_python + test_cpp: + output: none + includes: + - cudatoolkit + test_python: + output: none + includes: + - cudatoolkit + - py_version + - test_python + checks: + output: none + includes: + - build + - develop + - py_version +channels: + - rapidsai + - rapidsai-nightly + - dask/label/dev + - conda-forge + - nvidia +dependencies: + build: + common: + - output_types: [conda, requirements] + packages: + - cmake>=3.23.1 + - cuda-python>=11.7.1,<12.0 + - cython>=0.29,<0.30 + - dlpack>=0.5,<0.6.0a0 + - pyarrow=9.0.0 + - rmm=22.12.* + - scikit-build>=0.13.1 + - output_types: conda + packages: + - arrow-cpp=9 + - c-compiler + - cxx-compiler + - librdkafka=1.7.0 + - protobuf>=3.20.1,<3.21.0a0 + - 
python>=3.8,<3.10 + specific: + - output_types: conda + matrices: + - matrix: + arch: x86_64 + packages: + - gcc_linux-64=9.* + - sysroot_linux-64==2.17 + - matrix: + arch: aarch64 + packages: + - gcc_linux-aarch64=9.* + - sysroot_linux-aarch64==2.17 + - output_types: conda + matrices: + - matrix: + arch: x86_64 + cuda: "11.5" + packages: + - nvcc_linux-64=11.5 + - matrix: + arch: aarch64 + cuda: "11.5" + packages: + - nvcc_linux-aarch64=11.5 + cudatoolkit: + specific: + - output_types: conda + matrices: + - matrix: + cuda: "11.2" + packages: + - cudatoolkit=11.2 + - matrix: + cuda: "11.4" + packages: + - cudatoolkit=11.4 + - matrix: + cuda: "11.5" + packages: + - cudatoolkit=11.5 + develop: + common: + - output_types: [conda, requirements] + packages: + - pre-commit + - output_types: conda + packages: + - doxygen=1.8.20 # pre-commit hook needs a specific version. + doc: + common: + - output_types: [conda, requirements] + packages: + - myst-nb + - nbsphinx + - numpydoc + - pandoc<=2.0.0 # We should check and fix all "<=" pinnings + - pydata-sphinx-theme + - sphinx + - sphinx-autobuild + - sphinx-copybutton + - sphinx-markdown-tables + - sphinxcontrib-websupport + notebook: + common: + - output_types: [conda, requirements] + packages: + - ipython + - notebook>=0.5.0 + py_version: + specific: + - output_types: conda + matrices: + - matrix: + py: "3.8" + packages: + - python=3.8 + - matrix: + py: "3.9" + packages: + - python=3.9 + run: + common: + - output_types: [conda, requirements] + packages: + - cachetools + - dask>=2022.9.2 + - distributed>=2022.9.2 + - fsspec>=0.6.0 + - numba>=0.56.2 + - numpy + - nvtx>=0.2.1 + - packaging + - pandas>=1.0,<1.6.0dev0 + - python-confluent-kafka=1.7.0 + - streamz + - typing_extensions + - output_types: conda + packages: + - cubinlinker + - cupy>=9.5.0,<12.0.0a0 + - pip + - pip: + - git+https://github.com/python-streamz/streamz.git@master + - pyorc + - ptxcompiler + - rmm=22.12.* + - output_types: requirements + packages: + 
# pip recognizes the index as a global option for the requirements.txt file + # This index is needed for rmm, cubinlinker, ptxcompiler. + - --extra-index-url=https://pypi.ngc.nvidia.com + - cubinlinker-cu11 + - git+https://github.com/python-streamz/streamz.git@master + - ptxcompiler-cu11 + - pyorc + - rmm-cu11=22.12.* + specific: + - output_types: requirements + matrices: + - matrix: + arch: x86_64 + packages: + - cupy-cuda115>=9.5.0,<12.0.0a0 # TODO: This might change to cupy-cuda11x? + - matrix: + arch: aarch64 + packages: + - cupy-cuda11x -f https://pip.cupy.dev/aarch64 # TODO: Verify that this works. + test_python: + common: + - output_types: [conda, requirements] + packages: + - aiobotocore>=2.2.0 + - boto3>=1.21.21 + - botocore>=1.24.21 + - dask-cuda=22.12.* + - fastavro>=0.22.9 + - hypothesis + - mimesis>=4.1.0 + - moto>=4.0.8 + - pytest + - pytest-benchmark + - pytest-cases + - pytest-cov + - pytest-xdist + - python-snappy>=0.6.0 + - pytorch<1.12.0 # We should check and fix all "<=" pinnings + - s3fs>=2022.3.0 + - scipy + - transformers diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index f5c9053ec92..db3bde8cca0 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -179,6 +179,7 @@ Reindexing / selection / label manipulation DataFrame.tail DataFrame.take DataFrame.tile + DataFrame.truncate .. 
_api.dataframe.missing: @@ -249,10 +250,12 @@ Serialization / IO / conversion :toctree: api/ DataFrame.from_arrow + DataFrame.from_dict DataFrame.from_pandas DataFrame.from_records DataFrame.hash_values DataFrame.to_arrow + DataFrame.to_dict DataFrame.to_dlpack DataFrame.to_parquet DataFrame.to_csv diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 245793e5ea6..0b2a58b2f87 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -177,6 +177,7 @@ Reindexing / selection / label manipulation Series.take Series.tail Series.tile + Series.truncate Series.where Series.mask @@ -367,6 +368,7 @@ Serialization / IO / conversion Series.to_arrow Series.to_cupy + Series.to_dict Series.to_dlpack Series.to_frame Series.to_hdf diff --git a/docs/cudf/source/user_guide/cupy-interop.ipynb b/docs/cudf/source/user_guide/cupy-interop.ipynb index 47c6ba408fb..3e169984ace 100644 --- a/docs/cudf/source/user_guide/cupy-interop.ipynb +++ b/docs/cudf/source/user_guide/cupy-interop.ipynb @@ -42,7 +42,7 @@ "\n", "2. We can also use `DataFrame.values`.\n", "\n", - "3. We can also convert via the [CUDA array interface](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html) by using cuDF's `to_cupy` functionality." + "3. We can also convert via the [CUDA array interface](https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html) by using cuDF's `to_cupy` functionality." ] }, { diff --git a/docs/cudf/source/user_guide/dask-cudf.md b/docs/cudf/source/user_guide/dask-cudf.md deleted file mode 100644 index 2d829008ac9..00000000000 --- a/docs/cudf/source/user_guide/dask-cudf.md +++ /dev/null @@ -1,104 +0,0 @@ -# Multi-GPU with Dask-cuDF - -cuDF is a single-GPU library. 
For Multi-GPU cuDF solutions we use -[Dask](https://dask.org/) and the [dask-cudf -package](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf), -which is able to scale cuDF across multiple GPUs on a single machine, -or multiple GPUs across many machines in a cluster. - -[Dask DataFrame](http://docs.dask.org/en/latest/dataframe.html) was -originally designed to scale Pandas, orchestrating many Pandas -DataFrames spread across many CPUs into a cohesive parallel DataFrame. -Because cuDF currently implements only a subset of the Pandas API, not -all Dask DataFrame operations work with cuDF. - -The following is tested and expected to work: - -## What works - -- Data ingestion - - - `dask_cudf.read_csv` - - Use standard Dask ingestion with Pandas, then convert to cuDF (For - Parquet and other formats this is often decently fast) - -- Linear operations - - - Element-wise operations: `df.x + df.y`, `df ** 2` - - Assignment: `df['z'] = df.x + df.y` - - Row-wise selections: `df[df.x > 0]` - - Loc: `df.loc['2001-01-01': '2005-02-02']` - - Date time/string accessors: `df.timestamp.dt.dayofweek` - - ... 
and most similar operations in this category that are already - implemented in cuDF - -- Reductions - - - Like `sum`, `mean`, `max`, `count`, and so on on - `Series` objects - - Support for reductions on full dataframes - - `std` - - Custom reductions with - [dask.dataframe.reduction](https://docs.dask.org/en/latest/generated/dask.dataframe.Series.reduction.html) - -- Groupby aggregations - - - On single columns: `df.groupby('x').y.max()` - - With custom aggregations: - - groupby standard deviation - - grouping on multiple columns - - groupby agg for multiple outputs - -- Joins: - - - On full unsorted columns: `left.merge(right, on='id')` - (expensive) - - On sorted indexes: - `left.merge(right, left_index=True, right_index=True)` (fast) - - On large and small dataframes: `left.merge(cudf_df, on='id')` - (fast) - -- Rolling operations - -- Converting to and from other forms - - - Dask + Pandas to Dask + cuDF - `df.map_partitions(cudf.DataFrame.from_pandas)` - - Dask + cuDF to Dask + Pandas - `df.map_partitions(lambda df: df.to_pandas())` - - cuDF to Dask + cuDF: - `dask.dataframe.from_pandas(df, npartitions=20)` - - Dask + cuDF to cuDF: `df.compute()` - -Additionally all generic Dask operations, like `compute`, `persist`, -`visualize` and so on work regardless. - -## Developing the API - -Above we mention the following: - -> and most similar operations in this category that are already -> implemented in cuDF - -This is because it is difficult to create a comprehensive list of -operations in the cuDF and Pandas libraries. The API is large enough to -be difficult to track effectively. For any operation that operates -row-wise like `fillna` or `query` things will likely, but not -certainly work. If operations don't work it is often due to a slight -inconsistency between Pandas and cuDF that is generally easy to fix. 
We -encourage users to look at the [cuDF issue -tracker](https://github.com/rapidsai/cudf/issues) to see if their -issue has already been reported and, if not, [raise a new -issue](https://github.com/rapidsai/cudf/issues/new). - -## Navigating the API - -This project reuses the [Dask -DataFrame](https://docs.dask.org/en/latest/dataframe.html) project, -which was originally designed for Pandas, with the newer library cuDF. -Because we use the same Dask classes for both projects there are often -methods that are implemented for Pandas, but not yet for cuDF. As a -result users looking at the full Dask DataFrame API can be misleading, -and often lead to frustration when operations that are advertised in the -Dask API do not work as expected with cuDF. We apologize for this in -advance. diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index f80644251c2..bd7793ac214 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -446,8 +446,8 @@ "id": "00914f2a", "metadata": {}, "source": [ - "In addition to the Series.apply() method for performing custom operations, you can also pass Series objects directly into [CUDA kernels written with Numba](https://numba.pydata.org/numba-doc/latest/cuda/kernels.html).\n", - "Note that this section requires basic CUDA knowledge. Refer to [numba's CUDA documentation](https://numba.pydata.org/numba-doc/latest/cuda/index.html) for details.\n", + "In addition to the Series.apply() method for performing custom operations, you can also pass Series objects directly into [CUDA kernels written with Numba](https://numba.readthedocs.io/en/stable/cuda/kernels.html).\n", + "Note that this section requires basic CUDA knowledge. 
Refer to [numba's CUDA documentation](https://numba.readthedocs.io/en/stable/cuda/index.html) for details.\n", "\n", "The easiest way to write a Numba kernel is to use `cuda.grid(1)` to manage thread indices, and then leverage Numba's `forall` method to configure the kernel for us. Below, define a basic multiplication kernel as an example and use `@cuda.jit` to compile it." ] @@ -485,7 +485,7 @@ "source": [ "This kernel will take an input array, multiply it by a configurable value (supplied at runtime), and store the result in an output array. Notice that we wrapped our logic in an `if` statement. Because we can launch more threads than the size of our array, we need to make sure that we don't use threads with an index that would be out of bounds. Leaving this out can result in undefined behavior.\n", "\n", - "To execute our kernel, must pre-allocate an output array and leverage the `forall` method mentioned above. First, we create a Series of all `0.0` in our DataFrame, since we want `float64` output. Next, we run the kernel with `forall`. `forall` requires us to specify our desired number of tasks, so we'll supply in the length of our Series (which we store in `size`). The [__cuda_array_interface__](https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html) is what allows us to directly call our Numba kernel on our Series." + "To execute our kernel, must pre-allocate an output array and leverage the `forall` method mentioned above. First, we create a Series of all `0.0` in our DataFrame, since we want `float64` output. Next, we run the kernel with `forall`. `forall` requires us to specify our desired number of tasks, so we'll supply in the length of our Series (which we store in `size`). The [__cuda_array_interface__](https://numba.readthedocs.io/en/stable/cuda/cuda_array_interface.html) is what allows us to directly call our Numba kernel on our Series." 
] }, { @@ -2624,7 +2624,7 @@ "- Generalized NA UDFs\n", "\n", "\n", - "For more information please see the [cuDF](https://docs.rapids.ai/api/cudf/nightly/), [Numba.cuda](https://numba.pydata.org/numba-doc/dev/cuda/index.html), and [CuPy](https://docs-cupy.chainer.org/en/stable/) documentation." + "For more information please see the [cuDF](https://docs.rapids.ai/api/cudf/nightly/), [Numba.cuda](https://numba.readthedocs.io/en/stable/cuda/index.html), and [CuPy](https://docs-cupy.chainer.org/en/stable/) documentation." ] } ], diff --git a/docs/cudf/source/user_guide/index.md b/docs/cudf/source/user_guide/index.md index d99056f69f2..86168f0d81b 100644 --- a/docs/cudf/source/user_guide/index.md +++ b/docs/cudf/source/user_guide/index.md @@ -11,7 +11,6 @@ missing-data groupby guide-to-udfs cupy-interop -dask-cudf options PandasCompat ``` diff --git a/docs/cudf/source/user_guide/io.md b/docs/cudf/source/user_guide/io.md index 9099a761f2c..3a803953502 100644 --- a/docs/cudf/source/user_guide/io.md +++ b/docs/cudf/source/user_guide/io.md @@ -170,7 +170,7 @@ If no value is set, behavior will be the same as the "STABLE" option. 
+=======================+========+========+==============+==============+=========+========+==============+==============+========+ | Snappy | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ - | ZSTD | ❌ | ❌ | Experimental | Experimental | ❌ | ❌ | Experimental | Experimental | ❌ | + | ZSTD | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ | DEFLATE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | Experimental | Experimental | ❌ | +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index e639320b028..57849b9ba0a 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2632,12 +2632,13 @@ public final ColumnVector stringSplitRecord(String delimiter) { /** * Returns a new strings column that contains substrings of the strings in the provided column. - * Overloading subString to support if end index is not provided. Appending -1 to indicate to - * read until end of string. + * The character positions to retrieve in each string are `[start, )`.. + * * @param start first character index to begin the substring(inclusive). 
*/ public final ColumnVector substring(int start) { - return substring(start, -1); + assert type.equals(DType.STRING) : "column type must be a String"; + return new ColumnVector(substringS(getNativeView(), start)); } /** @@ -3983,6 +3984,13 @@ private static native long stringSplitRecord(long nativeHandle, String pattern, */ private static native long substring(long columnView, int start, int end) throws CudfException; + /** + * Native method to extract substrings from a given strings column. + * @param columnView native handle of the cudf::column_view being operated on. + * @param start first character index to begin the substrings (inclusive). + */ + private static native long substringS(long columnView, int start) throws CudfException; + /** * Native method to calculate substring from a given string column. * @param columnView native handle of the cudf::column_view being operated on. diff --git a/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java index 9f0d9a451c0..e6b3994235d 100644 --- a/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +28,23 @@ * subclassing beyond what is included in CUDF is not recommended and not supported. */ abstract public class MemoryBuffer implements AutoCloseable { + /** + * Interface to handle events for this MemoryBuffer. Only invoked during + * close, hence `onClosed` is the only event. + */ + public interface EventHandler { + /** + * `onClosed` is invoked with the updated `refCount` during `close`. + * The last invocation of `onClosed` will be with `refCount=0`. + * + * @note the callback is invoked with this `MemoryBuffer`'s lock held. 
+ * + * @param refCount - the updated ref count for this MemoryBuffer at the time + * of invocation + */ + void onClosed(int refCount); + } + private static final Logger log = LoggerFactory.getLogger(MemoryBuffer.class); protected final long address; protected final long length; @@ -36,6 +53,8 @@ abstract public class MemoryBuffer implements AutoCloseable { protected final MemoryBufferCleaner cleaner; protected final long id; + private EventHandler eventHandler; + public static abstract class MemoryBufferCleaner extends MemoryCleaner.Cleaner{} private static final class SlicedBufferCleaner extends MemoryBufferCleaner { @@ -193,6 +212,27 @@ public final void copyFromMemoryBufferAsync( */ public abstract MemoryBuffer slice(long offset, long len); + /** + * Set an event handler for this buffer. This method can be invoked with null + * to unset the handler. + * + * @param newHandler - the EventHandler to use from this point forward + * @return the prior event handler, or null if not set. + */ + public synchronized EventHandler setEventHandler(EventHandler newHandler) { + EventHandler prev = this.eventHandler; + this.eventHandler = newHandler; + return prev; + } + + /** + * Returns the current event handler for this buffer or null if no handler + * is associated or this buffer is closed. + */ + public synchronized EventHandler getEventHandler() { + return this.eventHandler; + } + /** * Close this buffer and free memory */ @@ -200,6 +240,9 @@ public synchronized void close() { if (cleaner != null) { refCount--; cleaner.delRef(); + if (eventHandler != null) { + eventHandler.onClosed(refCount); + } if (refCount == 0) { cleaner.clean(false); closed = true; @@ -232,8 +275,10 @@ public synchronized void incRefCount() { cleaner.addRef(); } - // visible for testing - synchronized int getRefCount() { + /** + * Get the current reference count for this buffer. 
+ */ + public synchronized int getRefCount() { return refCount; } } diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 0b825937815..a8ca8a2c4d3 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -185,14 +185,30 @@ public static void resetScopedMaximumBytesAllocated() { * @throws RmmException if an active handler is already set */ public static void setEventHandler(RmmEventHandler handler) throws RmmException { + setEventHandler(handler, false); + } + + /** + * Sets the event handler to be called on RMM events (e.g.: allocation failure) and + * optionally enable debug mode (callbacks on every allocate and deallocate) + * + * NOTE: Only enable debug mode when necessary, as code will run much slower! + * + * @param handler event handler to invoke on RMM events or null to clear an existing handler + * @param enableDebug if true enable debug callbacks in RmmEventHandler + * (onAllocated, onDeallocated) + * @throws RmmException if an active handler is already set + */ + public static void setEventHandler(RmmEventHandler handler, + boolean enableDebug) throws RmmException { long[] allocThresholds = (handler != null) ? sortThresholds(handler.getAllocThresholds()) : null; long[] deallocThresholds = (handler != null) ? sortThresholds(handler.getDeallocThresholds()) : null; - setEventHandlerInternal(handler, allocThresholds, deallocThresholds); + setEventHandlerInternal(handler, allocThresholds, deallocThresholds, enableDebug); } /** Clears the active RMM event handler if one is set. 
*/ public static void clearEventHandler() throws RmmException { - setEventHandlerInternal(null, null, null); + setEventHandlerInternal(null, null, null, false); } private static long[] sortThresholds(long[] thresholds) { @@ -300,7 +316,8 @@ public static DeviceMemoryBuffer alloc(long size, Cuda.Stream stream) { static native void freeDeviceBuffer(long rmmBufferAddress) throws RmmException; static native void setEventHandlerInternal(RmmEventHandler handler, - long[] allocThresholds, long[] deallocThresholds) throws RmmException; + long[] allocThresholds, long[] deallocThresholds, + boolean enableDebug) throws RmmException; /** * Allocate device memory using `cudaMalloc` and return a pointer to device memory. diff --git a/java/src/main/java/ai/rapids/cudf/RmmEventHandler.java b/java/src/main/java/ai/rapids/cudf/RmmEventHandler.java index 19707b85bcb..347ef471a15 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmEventHandler.java +++ b/java/src/main/java/ai/rapids/cudf/RmmEventHandler.java @@ -32,6 +32,18 @@ default boolean onAllocFailure(long sizeRequested) { "Unexpected invocation of deprecated onAllocFailure without retry count."); } + /** + * Invoked after every memory allocation when debug mode is enabled. + * @param size number of bytes allocated + */ + default void onAllocated(long size) {} + + /** + * Invoked after every memory deallocation when debug mode is enabled. + * @param size number of bytes deallocated + */ + default void onDeallocated(long size) {} + /** * Invoked on a memory allocation failure. 
* @param sizeRequested number of bytes that failed to allocate diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index f52d3201a10..4acc14c760c 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -69,7 +69,7 @@ #include #include #include -#include +#include #include #include #include @@ -1397,6 +1397,18 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVS(JNIEnv *env, j CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substringS(JNIEnv *env, jclass, + jlong cv_handle, jint start) { + JNI_NULL_CHECK(env, cv_handle, "column is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const cv = reinterpret_cast(cv_handle); + auto const scv = cudf::strings_column_view{*cv}; + return release_as_jlong(cudf::strings::slice_strings(scv, start)); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substring(JNIEnv *env, jclass, jlong column_view, jint start, jint end) { @@ -1405,8 +1417,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substring(JNIEnv *env, jc cudf::jni::auto_set_device(env); cudf::column_view *cv = reinterpret_cast(column_view); cudf::strings_column_view scv(*cv); - return release_as_jlong((end == -1 ? 
cudf::strings::slice_strings(scv, start) : - cudf::strings::slice_strings(scv, start, end))); + return release_as_jlong(cudf::strings::slice_strings(scv, start, end)); } CATCH_STD(env, 0); } diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 529345b6bd8..b12f1ed0841 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -90,11 +90,14 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { void reset_scoped_max_total_allocated(std::size_t initial_value) override { std::scoped_lock lock(max_total_allocated_mutex); - scoped_allocated = 0; + scoped_allocated = initial_value; scoped_max_total_allocated = initial_value; } - std::size_t get_scoped_max_total_allocated() override { return scoped_max_total_allocated; } + std::size_t get_scoped_max_total_allocated() override { + std::scoped_lock lock(max_total_allocated_mutex); + return scoped_max_total_allocated; + } private: Upstream *const resource; @@ -123,7 +126,6 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { if (result) { total_allocated += num_bytes; scoped_allocated += num_bytes; - std::scoped_lock lock(max_total_allocated_mutex); max_total_allocated = std::max(total_allocated.load(), max_total_allocated); scoped_max_total_allocated = std::max(scoped_allocated.load(), scoped_max_total_allocated); @@ -193,7 +195,7 @@ std::size_t get_scoped_max_total_allocated() { * @brief An RMM device memory resource adaptor that delegates to the wrapped resource * for most operations but will call Java to handle certain situations (e.g.: allocation failure). 
*/ -class java_event_handler_memory_resource final : public device_memory_resource { +class java_event_handler_memory_resource : public device_memory_resource { public: java_event_handler_memory_resource(JNIEnv *env, jobject jhandler, jlongArray jalloc_thresholds, jlongArray jdealloc_thresholds, @@ -250,8 +252,6 @@ class java_event_handler_memory_resource final : public device_memory_resource { private: device_memory_resource *const resource; - JavaVM *jvm; - jobject handler_obj; jmethodID on_alloc_fail_method; bool use_old_alloc_fail_interface; jmethodID on_alloc_threshold_method; @@ -309,6 +309,18 @@ class java_event_handler_memory_resource final : public device_memory_resource { } } + bool supports_get_mem_info() const noexcept override { return resource->supports_get_mem_info(); } + + std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override { + return resource->get_mem_info(stream); + } + + bool supports_streams() const noexcept override { return resource->supports_streams(); } + +protected: + JavaVM *jvm; + jobject handler_obj; + void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override { std::size_t total_before; void *result; @@ -348,20 +360,65 @@ class java_event_handler_memory_resource final : public device_memory_resource { check_for_threshold_callback(total_after, total_before, dealloc_thresholds, on_dealloc_threshold_method, "onDeallocThreshold", total_after); } +}; - bool supports_get_mem_info() const noexcept override { return resource->supports_get_mem_info(); } +class java_debug_event_handler_memory_resource final : public java_event_handler_memory_resource { +public: + java_debug_event_handler_memory_resource(JNIEnv *env, jobject jhandler, + jlongArray jalloc_thresholds, + jlongArray jdealloc_thresholds, + device_memory_resource *resource_to_wrap) + : java_event_handler_memory_resource(env, jhandler, jalloc_thresholds, jdealloc_thresholds, + resource_to_wrap) { + jclass cls = env->GetObjectClass(jhandler); + if 
(cls == nullptr) { + throw cudf::jni::jni_exception("class not found"); + } - std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override { - return resource->get_mem_info(stream); + on_allocated_method = env->GetMethodID(cls, "onAllocated", "(J)V"); + if (on_allocated_method == nullptr) { + throw cudf::jni::jni_exception("onAllocated method"); + } + + on_deallocated_method = env->GetMethodID(cls, "onDeallocated", "(J)V"); + if (on_deallocated_method == nullptr) { + throw cudf::jni::jni_exception("onDeallocated method"); + } } - bool supports_streams() const noexcept override { return resource->supports_streams(); } +private: + jmethodID on_allocated_method; + jmethodID on_deallocated_method; + + void on_allocated_callback(std::size_t num_bytes, rmm::cuda_stream_view stream) { + JNIEnv *env = cudf::jni::get_jni_env(jvm); + env->CallVoidMethod(handler_obj, on_allocated_method, num_bytes); + if (env->ExceptionCheck()) { + throw std::runtime_error("onAllocated handler threw an exception"); + } + } + + void on_deallocated_callback(void *p, std::size_t size, rmm::cuda_stream_view stream) { + JNIEnv *env = cudf::jni::get_jni_env(jvm); + env->CallVoidMethod(handler_obj, on_deallocated_method, size); + } + + void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override { + void *result = java_event_handler_memory_resource::do_allocate(num_bytes, stream); + on_allocated_callback(num_bytes, stream); + return result; + } + + void do_deallocate(void *p, std::size_t size, rmm::cuda_stream_view stream) override { + java_event_handler_memory_resource::do_deallocate(p, size, stream); + on_deallocated_callback(p, size, stream); + } }; std::unique_ptr Java_memory_resource{}; void set_java_device_memory_resource(JNIEnv *env, jobject handler_obj, jlongArray jalloc_thresholds, - jlongArray jdealloc_thresholds) { + jlongArray jdealloc_thresholds, jboolean enable_debug) { if (Java_memory_resource && handler_obj != nullptr) { JNI_THROW_NEW(env, 
RMM_EXCEPTION_CLASS, "Another event handler is already set", ) } @@ -378,8 +435,13 @@ void set_java_device_memory_resource(JNIEnv *env, jobject handler_obj, jlongArra } if (handler_obj != nullptr) { auto resource = rmm::mr::get_current_device_resource(); - Java_memory_resource.reset(new java_event_handler_memory_resource( - env, handler_obj, jalloc_thresholds, jdealloc_thresholds, resource)); + if (enable_debug) { + Java_memory_resource.reset(new java_debug_event_handler_memory_resource( + env, handler_obj, jalloc_thresholds, jdealloc_thresholds, resource)); + } else { + Java_memory_resource.reset(new java_event_handler_memory_resource( + env, handler_obj, jalloc_thresholds, jdealloc_thresholds, resource)); + } auto replaced_resource = rmm::mr::set_current_device_resource(Java_memory_resource.get()); if (resource != replaced_resource) { rmm::mr::set_current_device_resource(replaced_resource); @@ -493,7 +555,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_shutdownInternal(JNIEnv *env, jclass clazz) { try { cudf::jni::auto_set_device(env); - set_java_device_memory_resource(env, nullptr, nullptr, nullptr); + set_java_device_memory_resource(env, nullptr, nullptr, nullptr, false); // Instead of trying to undo all of the adaptors that we added in reverse order // we just reset the base adaptor so the others will not be called any more // and then clean them up in really any order. 
There should be no interaction with @@ -517,7 +579,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumTotalBytesAllocated(JN } JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetScopedMaximumBytesAllocatedInternal( - JNIEnv *env, jclass, long initialValue) { + JNIEnv *env, jclass, jlong initialValue) { reset_scoped_max_total_allocated(initialValue); } @@ -562,9 +624,10 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeDeviceBuffer(JNIEnv *env, jcl JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setEventHandlerInternal( JNIEnv *env, jclass, jobject handler_obj, jlongArray jalloc_thresholds, - jlongArray jdealloc_thresholds) { + jlongArray jdealloc_thresholds, jboolean enable_debug) { try { - set_java_device_memory_resource(env, handler_obj, jalloc_thresholds, jdealloc_thresholds); + set_java_device_memory_resource(env, handler_obj, jalloc_thresholds, jdealloc_thresholds, + enable_debug); } CATCH_STD(env, ) } diff --git a/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java b/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java index df710c71f63..c332ce660d1 100644 --- a/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java +++ b/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,8 @@ import org.junit.jupiter.api.Test; +import java.util.concurrent.atomic.AtomicInteger; + import static org.junit.jupiter.api.Assertions.*; public class MemoryBufferTest extends CudfTestBase { @@ -168,4 +170,49 @@ private void verifyOutput(HostMemoryBuffer out) { out.getBytes(bytes, 0, 0, 16); assertArrayEquals(EXPECTED, bytes); } + + @Test + public void testEventHandlerIsCalledForEachClose() { + final AtomicInteger onClosedWasCalled = new AtomicInteger(0); + try (DeviceMemoryBuffer b = DeviceMemoryBuffer.allocate(256)) { + b.setEventHandler(refCount -> onClosedWasCalled.incrementAndGet()); + } + assertEquals(1, onClosedWasCalled.get()); + onClosedWasCalled.set(0); + + try (DeviceMemoryBuffer b = DeviceMemoryBuffer.allocate(256)) { + b.setEventHandler(refCount -> onClosedWasCalled.incrementAndGet()); + DeviceMemoryBuffer sliced = b.slice(0, b.getLength()); + sliced.close(); + } + assertEquals(2, onClosedWasCalled.get()); + } + + @Test + public void testEventHandlerIsNotCalledIfNotSet() { + final AtomicInteger onClosedWasCalled = new AtomicInteger(0); + try (DeviceMemoryBuffer b = DeviceMemoryBuffer.allocate(256)) { + assertNull(b.getEventHandler()); + } + assertEquals(0, onClosedWasCalled.get()); + try (DeviceMemoryBuffer b = DeviceMemoryBuffer.allocate(256)) { + b.setEventHandler(refCount -> onClosedWasCalled.incrementAndGet()); + b.setEventHandler(null); + } + assertEquals(0, onClosedWasCalled.get()); + } + + @Test + public void testEventHandlerReturnsPreviousHandlerOnReset() { + try (DeviceMemoryBuffer b = DeviceMemoryBuffer.allocate(256)) { + MemoryBuffer.EventHandler handler = refCount -> {}; + MemoryBuffer.EventHandler handler2 = refCount -> {}; + + assertNull(b.setEventHandler(handler)); + assertEquals(handler, b.setEventHandler(null)); + + assertNull(b.setEventHandler(handler2)); + assertEquals(handler2, b.setEventHandler(handler)); + } + } } diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java 
b/java/src/test/java/ai/rapids/cudf/RmmTest.java index 18ff5f4081e..c081f51c9f2 100644 --- a/java/src/test/java/ai/rapids/cudf/RmmTest.java +++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java @@ -115,7 +115,7 @@ public void testScopedMaxOutstanding(int rmmAllocMode) { try(DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) { Rmm.resetScopedMaximumBytesAllocated(1024); try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) { - assertEquals(20480, Rmm.getScopedMaximumBytesAllocated()); + assertEquals(21504, Rmm.getScopedMaximumBytesAllocated()); assertEquals(21504, Rmm.getMaximumTotalBytesAllocated()); } } @@ -157,6 +157,8 @@ public void testEventHandler(int rmmAllocMode) { AtomicInteger invokedCount = new AtomicInteger(); AtomicLong amountRequested = new AtomicLong(); AtomicInteger timesRetried = new AtomicInteger(); + AtomicLong totalAllocated = new AtomicLong(); + AtomicLong totalDeallocated = new AtomicLong(); RmmEventHandler handler = new BaseRmmEventHandler() { @Override @@ -166,6 +168,16 @@ public boolean onAllocFailure(long sizeRequested, int retryCount) { amountRequested.set(sizeRequested); return count != 3; } + + @Override + public void onAllocated(long sizeAllocated) { + totalAllocated.addAndGet(sizeAllocated); + } + + @Override + public void onDeallocated(long sizeDeallocated) { + totalDeallocated.addAndGet(sizeDeallocated); + } }; Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024); @@ -175,6 +187,10 @@ public boolean onAllocFailure(long sizeRequested, int retryCount) { assertTrue(addr.address != 0); assertEquals(0, invokedCount.get()); + // by default, we don't get callbacks on allocated or deallocated + assertEquals(0, totalAllocated.get()); + assertEquals(0, totalDeallocated.get()); + // Try to allocate too much long requested = TOO_MUCH_MEMORY; try { @@ -192,6 +208,14 @@ public boolean onAllocFailure(long sizeRequested, int retryCount) { requested = 8192; addr = Rmm.alloc(requested); addr.close(); + + // test the debug event handler + 
Rmm.clearEventHandler(); + Rmm.setEventHandler(handler, /*enableDebug*/ true); + addr = Rmm.alloc(1024); + addr.close(); + assertEquals(1024, totalAllocated.get()); + assertEquals(1024, totalDeallocated.get()); } @Test diff --git a/notebooks/10min.ipynb b/notebooks/10min.ipynb new file mode 120000 index 00000000000..bd57fc7375e --- /dev/null +++ b/notebooks/10min.ipynb @@ -0,0 +1 @@ +../docs/cudf/source/user_guide/10min.ipynb \ No newline at end of file diff --git a/notebooks/cupy-interop.ipynb b/notebooks/cupy-interop.ipynb new file mode 120000 index 00000000000..0ba88107fc5 --- /dev/null +++ b/notebooks/cupy-interop.ipynb @@ -0,0 +1 @@ +../docs/cudf/source/user_guide/cupy-interop.ipynb \ No newline at end of file diff --git a/notebooks/guide-to-udfs.ipynb b/notebooks/guide-to-udfs.ipynb new file mode 120000 index 00000000000..a4bbe597fee --- /dev/null +++ b/notebooks/guide-to-udfs.ipynb @@ -0,0 +1 @@ +../docs/cudf/source/user_guide/guide-to-udfs.ipynb \ No newline at end of file diff --git a/notebooks/missing-data.ipynb b/notebooks/missing-data.ipynb new file mode 120000 index 00000000000..7e3b01ae0b3 --- /dev/null +++ b/notebooks/missing-data.ipynb @@ -0,0 +1 @@ +../docs/cudf/source/user_guide/missing-data.ipynb \ No newline at end of file diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index f8eb3af86d7..8a3224237b6 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -31,9 +31,25 @@ project( option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files" OFF ) +option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF) +mark_as_advanced(USE_LIBARROW_FROM_PYARROW) # If the user requested it we attempt to find CUDF. if(FIND_CUDF_CPP) + if(USE_LIBARROW_FROM_PYARROW) + # We need to find arrow before libcudf since libcudf requires it but doesn't bundle it. 
TODO: + # These options should probably all become optional since in practice they aren't meaningful + # except in the case where we actually compile Arrow. + set(CUDF_USE_ARROW_STATIC OFF) + set(CUDF_ENABLE_ARROW_S3 OFF) + set(CUDF_ENABLE_ARROW_ORC OFF) + set(CUDF_ENABLE_ARROW_PYTHON OFF) + set(CUDF_ENABLE_ARROW_PARQUET OFF) + include(rapids-find) + include(rapids-export) + include(../../cpp/cmake/thirdparty/get_arrow.cmake) + endif() + find_package(cudf ${cudf_version} REQUIRED) else() set(cudf_FOUND OFF) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index d9a7a5b8754..1de91e6a3e9 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -132,36 +132,37 @@ def _copy_range(Column input_column, return Column.from_unique_ptr(move(c_result)) -def copy_range(Column input_column, +def copy_range(Column source_column, Column target_column, - size_type input_begin, - size_type input_end, + size_type source_begin, + size_type source_end, size_type target_begin, size_type target_end, bool inplace): """ - Copy input_column from input_begin to input_end to - target_column from target_begin to target_end - """ - - if abs(target_end - target_begin) <= 1: - return target_column + Copy a contiguous range from a source to a target column - if target_begin < 0: - target_begin = target_begin + target_column.size - - if target_end < 0: - target_end = target_end + target_column.size + Notes + ----- + Expects the source and target ranges to have been sanitised to be + in-range for the source and target column respectively. For + example via ``slice.indices``. + """ - if target_begin > target_end: + assert ( + source_end - source_begin == + target_end - target_begin + ), "Source and target ranges must be same length" + if target_end >= target_begin and inplace: + # FIXME: Are we allowed to do this when inplace=False?
return target_column - if inplace is True: - _copy_range_in_place(input_column, target_column, - input_begin, input_end, target_begin) + if inplace: + _copy_range_in_place(source_column, target_column, + source_begin, source_end, target_begin) else: - return _copy_range(input_column, target_column, - input_begin, input_end, target_begin) + return _copy_range(source_column, target_column, + source_begin, source_end, target_begin) def gather( diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index 5534d732f53..3889fcc4cc0 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -4,7 +4,6 @@ from packaging import version PANDAS_VERSION = version.parse(pd.__version__) -PANDAS_GE_100 = PANDAS_VERSION >= version.parse("1.0") PANDAS_GE_110 = PANDAS_VERSION >= version.parse("1.1") PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2") PANDAS_LE_122 = PANDAS_VERSION <= version.parse("1.2.2") diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f22add316e6..59851a1c11b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -848,6 +848,8 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: + if self.dtype == dtype: + return self if is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 0beb07bb591..5ee9024a0d8 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -399,4 +399,12 @@ def _get_decimal_type(lhs_dtype, rhs_dtype, op): # to try the next dtype continue - raise OverflowError("Maximum supported decimal type is Decimal128") + # Instead of raising an overflow error, we create a `Decimal128Dtype` + # with max possible scale & precision, see example of this 
demonstration + # here: https://learn.microsoft.com/en-us/sql/t-sql/data-types/ + # precision-scale-and-length-transact-sql?view=sql-server-ver16#examples + scale = min( + scale, cudf.Decimal128Dtype.MAX_PRECISION - (precision - scale) + ) + precision = min(cudf.Decimal128Dtype.MAX_PRECISION, max_precision) + return cudf.Decimal128Dtype(precision=precision, scale=scale) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5c24b222a1b..99fcac57306 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1992,6 +1992,247 @@ def _make_operands_and_index_for_binop( operands[k] = (left_default, v, reflect, None) return operands, index + @classmethod + @_cudf_nvtx_annotate + def from_dict( + cls, + data: dict, + orient: str = "columns", + dtype: Dtype = None, + columns: list = None, + ) -> DataFrame: + """ + Construct DataFrame from dict of array-like or dicts. + Creates DataFrame object from dictionary by columns or by index + allowing dtype specification. + + Parameters + ---------- + data : dict + Of the form {field : array-like} or {field : dict}. + orient : {'columns', 'index', 'tight'}, default 'columns' + The "orientation" of the data. If the keys of the passed dict + should be the columns of the resulting DataFrame, pass 'columns' + (default). Otherwise if the keys should be rows, pass 'index'. + If 'tight', assume a dict with keys ['index', 'columns', 'data', + 'index_names', 'column_names']. + dtype : dtype, default None + Data type to force, otherwise infer. + columns : list, default None + Column labels to use when ``orient='index'``. Raises a ``ValueError`` + if used with ``orient='columns'`` or ``orient='tight'``. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.from_records : DataFrame from structured ndarray, sequence + of tuples or dicts, or DataFrame. + DataFrame : DataFrame object creation using constructor. 
+ DataFrame.to_dict : Convert the DataFrame to a dictionary. + + Examples + -------- + By default the keys of the dict become the DataFrame columns: + + >>> import cudf + >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} + >>> cudf.DataFrame.from_dict(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Specify ``orient='index'`` to create the DataFrame using dictionary + keys as rows: + + >>> data = {'row_1': [3, 2, 1, 0], 'row_2': [10, 11, 12, 13]} + >>> cudf.DataFrame.from_dict(data, orient='index') + 0 1 2 3 + row_1 3 2 1 0 + row_2 10 11 12 13 + + When using the 'index' orientation, the column names can be + specified manually: + + >>> cudf.DataFrame.from_dict(data, orient='index', + ... columns=['A', 'B', 'C', 'D']) + A B C D + row_1 3 2 1 0 + row_2 10 11 12 13 + + Specify ``orient='tight'`` to create the DataFrame using a 'tight' + format: + + >>> data = {'index': [('a', 'b'), ('a', 'c')], + ... 'columns': [('x', 1), ('y', 2)], + ... 'data': [[1, 3], [2, 4]], + ... 'index_names': ['n1', 'n2'], + ... 
'column_names': ['z1', 'z2']} + >>> cudf.DataFrame.from_dict(data, orient='tight') + z1 x y + z2 1 2 + n1 n2 + a b 1 3 + c 2 4 + """ # noqa: E501 + + orient = orient.lower() + if orient == "index": + if len(data) > 0 and isinstance( + next(iter(data.values())), (cudf.Series, cupy.ndarray) + ): + result = cls(data).T + result.columns = columns + if dtype is not None: + result = result.astype(dtype) + return result + else: + return cls.from_pandas( + pd.DataFrame.from_dict( + data=data, + orient=orient, + dtype=dtype, + columns=columns, + ) + ) + elif orient == "columns": + if columns is not None: + raise ValueError( + "Cannot use columns parameter with orient='columns'" + ) + return cls(data, columns=None, dtype=dtype) + elif orient == "tight": + if columns is not None: + raise ValueError( + "Cannot use columns parameter with orient='tight'" + ) + + index = _from_dict_create_index( + data["index"], data["index_names"], cudf + ) + columns = _from_dict_create_index( + data["columns"], data["column_names"], pd + ) + return cls(data["data"], index=index, columns=columns, dtype=dtype) + else: + raise ValueError( + "Expected 'index', 'columns' or 'tight' for orient " + f"parameter. Got '{orient}' instead" + ) + + @_cudf_nvtx_annotate + def to_dict( + self, + orient: str = "dict", + into: type[dict] = dict, + ) -> dict | list[dict]: + """ + Convert the DataFrame to a dictionary. + + The type of the key-value pairs can be customized with the parameters + (see below). + + Parameters + ---------- + orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'} + Determines the type of the values of the dictionary.
+ + - 'dict' (default) : dict like {column -> {index -> value}} + - 'list' : dict like {column -> [values]} + - 'series' : dict like {column -> Series(values)} + - 'split' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} + - 'tight' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values], + 'index_names' -> [index.names], 'column_names' -> [column.names]} + - 'records' : list like + [{column -> value}, ... , {column -> value}] + - 'index' : dict like {index -> {column -> value}} + Abbreviations are allowed. `s` indicates `series` and `sp` + indicates `split`. + + into : class, default dict + The collections.abc.Mapping subclass used for all Mappings + in the return value. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + Returns + ------- + dict, list or collections.abc.Mapping + Return a collections.abc.Mapping object representing the DataFrame. + The resulting transformation depends on the `orient` parameter. + + See Also + -------- + DataFrame.from_dict: Create a DataFrame from a dictionary. + DataFrame.to_json: Convert a DataFrame to JSON format. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'col1': [1, 2], + ... 'col2': [0.5, 0.75]}, + ... index=['row1', 'row2']) + >>> df + col1 col2 + row1 1 0.50 + row2 2 0.75 + >>> df.to_dict() + {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}} + + You can specify the return orientation. 
+ + >>> df.to_dict('series') + {'col1': row1 1 + row2 2 + Name: col1, dtype: int64, + 'col2': row1 0.50 + row2 0.75 + Name: col2, dtype: float64} + + >>> df.to_dict('split') + {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], + 'data': [[1, 0.5], [2, 0.75]]} + + >>> df.to_dict('records') + [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] + + >>> df.to_dict('index') + {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} + + >>> df.to_dict('tight') + {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], + 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]} + + You can also specify the mapping type. + + >>> from collections import OrderedDict, defaultdict + >>> df.to_dict(into=OrderedDict) + OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), + ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) + + If you want a `defaultdict`, you need to initialize it: + + >>> dd = defaultdict(list) + >>> df.to_dict('records', into=dd) + [defaultdict(, {'col1': 1, 'col2': 0.5}), + defaultdict(, {'col1': 2, 'col2': 0.75})] + """ # noqa: E501 + orient = orient.lower() + + if orient == "series": + # Special case needed to avoid converting + # cudf.Series objects into pd.Series + into_c = pd.core.common.standardize_mapping(into) + return into_c((k, v) for k, v in self.items()) + + return self.to_pandas().to_dict(orient=orient, into=into) + @_cudf_nvtx_annotate def scatter_by_map( self, map_index, map_size=None, keep_index=True, **kwargs @@ -3942,7 +4183,7 @@ def apply( ``apply`` relies on Numba to JIT compile ``func``. Thus the allowed operations within ``func`` are limited to `those supported by the CUDA Python Numba target - `__. + `__. For more information, see the `cuDF guide to user defined functions `__. @@ -4297,7 +4538,7 @@ def apply_chunks( respectively (See `numba CUDA kernel documentation`_). .. 
_numba CUDA kernel documentation:\ - http://numba.pydata.org/numba-doc/latest/cuda/kernels.html + https://numba.readthedocs.io/en/stable/cuda/kernels.html In the example below, the *kernel* is invoked concurrently on each specified chunk. The *kernel* computes the corresponding output @@ -7444,3 +7685,11 @@ def _reassign_categories(categories, cols, col_idxs): offset=cols[name].offset, size=cols[name].size, ) + + +def _from_dict_create_index(indexlist, namelist, library): + if len(namelist) > 1: + index = library.MultiIndex.from_tuples(indexlist, names=namelist) + else: + index = library.Index(indexlist, name=namelist[0]) + return index diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 29d5c9ae26d..687338f882d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -321,15 +321,19 @@ def __len__(self): @_cudf_nvtx_annotate def astype(self, dtype, copy=False, **kwargs): - result = {} + result_data = {} for col_name, col in self._data.items(): dt = dtype.get(col_name, col.dtype) if not is_dtype_equal(dt, col.dtype): - result[col_name] = col.astype(dt, copy=copy, **kwargs) + result_data[col_name] = col.astype(dt, copy=copy, **kwargs) else: - result[col_name] = col.copy() if copy else col + result_data[col_name] = col.copy() if copy else col - return result + return ColumnAccessor._create_unsafe( + data=result_data, + multiindex=self._data.multiindex, + level_names=self._data.level_names, + ) @_cudf_nvtx_annotate def equals(self, other): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 57469c0ff72..28039366725 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -270,13 +270,6 @@ def __init__(self, data=None, index=None): # to ensure that this constructor is always invoked with an index. 
self._index = index - def to_dict(self, *args, **kwargs): # noqa: D102 - raise TypeError( - "cuDF does not support conversion to host memory " - "via `to_dict()` method. Consider using " - "`.to_pandas().to_dict()` to construct a Python dictionary." - ) - @property def _num_rows(self) -> int: # Important to use the index because the data may be empty. @@ -1045,6 +1038,206 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): zip(self._column_names, data_columns), self._index ) + @_cudf_nvtx_annotate + def truncate(self, before=None, after=None, axis=0, copy=True): + """ + Truncate a Series or DataFrame before and after some index value. + + This is a useful shorthand for boolean indexing based on index + values above or below certain thresholds. + + Parameters + ---------- + before : date, str, int + Truncate all rows before this index value. + after : date, str, int + Truncate all rows after this index value. + axis : {0 or 'index', 1 or 'columns'}, optional + Axis to truncate. Truncates the index (rows) by default. + copy : bool, default is True, + Return a copy of the truncated section. + + Returns + ------- + The truncated Series or DataFrame. + + Notes + ----- + If the index being truncated contains only datetime values, + `before` and `after` may be specified as strings instead of + Timestamps. + + .. pandas-compat:: + **DataFrame.truncate, Series.truncate** + + The ``copy`` parameter is only present for API compatibility, but + ``copy=False`` is not supported. This method always generates a + copy. + + Examples + -------- + **Series** + + >>> import cudf + >>> cs1 = cudf.Series([1, 2, 3, 4]) + >>> cs1 + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> cs1.truncate(before=1, after=2) + 1 2 + 2 3 + dtype: int64 + + >>> import cudf + >>> dates = cudf.date_range( + ... '2021-01-01 23:45:00', '2021-01-01 23:46:00', freq='s' + ... 
) + >>> cs2 = cudf.Series(range(len(dates)), index=dates) + >>> cs2 + 2021-01-01 23:45:00 0 + 2021-01-01 23:45:01 1 + 2021-01-01 23:45:02 2 + 2021-01-01 23:45:03 3 + 2021-01-01 23:45:04 4 + 2021-01-01 23:45:05 5 + 2021-01-01 23:45:06 6 + 2021-01-01 23:45:07 7 + 2021-01-01 23:45:08 8 + 2021-01-01 23:45:09 9 + 2021-01-01 23:45:10 10 + 2021-01-01 23:45:11 11 + 2021-01-01 23:45:12 12 + 2021-01-01 23:45:13 13 + 2021-01-01 23:45:14 14 + 2021-01-01 23:45:15 15 + 2021-01-01 23:45:16 16 + 2021-01-01 23:45:17 17 + 2021-01-01 23:45:18 18 + 2021-01-01 23:45:19 19 + 2021-01-01 23:45:20 20 + 2021-01-01 23:45:21 21 + 2021-01-01 23:45:22 22 + 2021-01-01 23:45:23 23 + 2021-01-01 23:45:24 24 + ... + 2021-01-01 23:45:56 56 + 2021-01-01 23:45:57 57 + 2021-01-01 23:45:58 58 + 2021-01-01 23:45:59 59 + dtype: int64 + + + >>> cs2.truncate( + ... before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ... ) + 2021-01-01 23:45:18 18 + 2021-01-01 23:45:19 19 + 2021-01-01 23:45:20 20 + 2021-01-01 23:45:21 21 + 2021-01-01 23:45:22 22 + 2021-01-01 23:45:23 23 + 2021-01-01 23:45:24 24 + 2021-01-01 23:45:25 25 + 2021-01-01 23:45:26 26 + 2021-01-01 23:45:27 27 + dtype: int64 + + >>> cs3 = cudf.Series({'A': 1, 'B': 2, 'C': 3, 'D': 4}) + >>> cs3 + A 1 + B 2 + C 3 + D 4 + dtype: int64 + + >>> cs3.truncate(before='B', after='C') + B 2 + C 3 + dtype: int64 + + **DataFrame** + + >>> df = cudf.DataFrame({ + ... 'A': ['a', 'b', 'c', 'd', 'e'], + ... 'B': ['f', 'g', 'h', 'i', 'j'], + ... 'C': ['k', 'l', 'm', 'n', 'o'] + ... }, index=[1, 2, 3, 4, 5]) + >>> df + A B C + 1 a f k + 2 b g l + 3 c h m + 4 d i n + 5 e j o + + >>> df.truncate(before=2, after=4) + A B C + 2 b g l + 3 c h m + 4 d i n + + >>> df.truncate(before="A", after="B", axis="columns") + A B + 1 a f + 2 b g + 3 c h + 4 d i + 5 e j + + >>> import cudf + >>> dates = cudf.date_range( + ... '2021-01-01 23:45:00', '2021-01-01 23:46:00', freq='s' + ... 
) + >>> df2 = cudf.DataFrame(data={'A': 1, 'B': 2}, index=dates) + >>> df2.head() + A B + 2021-01-01 23:45:00 1 2 + 2021-01-01 23:45:01 1 2 + 2021-01-01 23:45:02 1 2 + 2021-01-01 23:45:03 1 2 + 2021-01-01 23:45:04 1 2 + + >>> df2.truncate( + ... before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ... ) + A B + 2021-01-01 23:45:18 1 2 + 2021-01-01 23:45:19 1 2 + 2021-01-01 23:45:20 1 2 + 2021-01-01 23:45:21 1 2 + 2021-01-01 23:45:22 1 2 + 2021-01-01 23:45:23 1 2 + 2021-01-01 23:45:24 1 2 + 2021-01-01 23:45:25 1 2 + 2021-01-01 23:45:26 1 2 + 2021-01-01 23:45:27 1 2 + """ + if not copy: + raise ValueError("Truncating with copy=False is not supported.") + axis = self._get_axis_from_axis_arg(axis) + ax = self._index if axis == 0 else self._data.to_pandas_index() + + if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing: + raise ValueError("truncate requires a sorted index") + + if type(ax) is cudf.core.index.DatetimeIndex: + before = pd.to_datetime(before) + after = pd.to_datetime(after) + + if before is not None and after is not None and before > after: + raise ValueError(f"Truncate: {after} must be after {before}") + + if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1: + before, after = after, before + + slicer = [slice(None, None)] * self.ndim + slicer[axis] = slice(before, after) + return self.loc[tuple(slicer)].copy() + @cached_property def loc(self): """Select rows and columns by label or boolean mask. 
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index e05e8662fe4..e516177ad29 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -392,4 +392,6 @@ def _dispatch_scalar_unaop(self, op): return getattr(self.value, op)() def astype(self, dtype): + if self.dtype == dtype: + return self return Scalar(self.value, dtype) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 799a286695a..8f4f6fe57d6 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -35,6 +35,7 @@ is_integer_dtype, is_list_dtype, is_scalar, + is_string_dtype, is_struct_dtype, ) from cudf.core.abc import Serializable @@ -214,19 +215,20 @@ def __setitem__(self, key, value): value = column.as_column(value) if ( - not isinstance( - self._frame._column.dtype, - (cudf.core.dtypes.DecimalDtype, cudf.CategoricalDtype), + ( + _is_non_decimal_numeric_dtype(self._frame._column.dtype) + or is_string_dtype(self._frame._column.dtype) ) and hasattr(value, "dtype") and _is_non_decimal_numeric_dtype(value.dtype) ): # normalize types if necessary: - if not is_integer(key): - to_dtype = np.result_type( - value.dtype, self._frame._column.dtype - ) - value = value.astype(to_dtype) + # In contrast to Column.__setitem__ (which downcasts the value to + # the dtype of the column) here we upcast the series to the + # larger data type mimicking pandas + to_dtype = np.result_type(value.dtype, self._frame._column.dtype) + value = value.astype(to_dtype) + if to_dtype != self._frame._column.dtype: self._frame._column._mimic_inplace( self._frame._column.astype(to_dtype), inplace=True ) @@ -283,6 +285,10 @@ def __setitem__(self, key, value): self._frame.iloc[key] = value def _loc_to_iloc(self, arg): + if isinstance(arg, tuple) and arg and isinstance(arg[0], slice): + if len(arg) > 1: + raise IndexError("Too many Indexers") + arg = arg[0] if _is_scalar_or_zero_d_array(arg): if not 
_is_non_decimal_numeric_dtype(self._frame.index.dtype): # TODO: switch to cudf.utils.dtypes.is_integer(arg) @@ -725,6 +731,45 @@ def drop( labels, axis, index, columns, level, inplace, errors ) + @_cudf_nvtx_annotate + def to_dict(self, into: type[dict] = dict) -> dict: + """ + Convert Series to {label -> value} dict or dict-like object. + + Parameters + ---------- + into : class, default dict + The collections.abc.Mapping subclass to use as the return + object. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + Returns + ------- + collections.abc.Mapping + Key-value representation of Series. + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([1, 2, 3, 4]) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + >>> s.to_dict() + {0: 1, 1: 2, 2: 3, 3: 4} + >>> from collections import OrderedDict, defaultdict + >>> s.to_dict(OrderedDict) + OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)]) + >>> dd = defaultdict(list) + >>> s.to_dict(dd) + defaultdict(, {0: 1, 1: 2, 2: 3, 3: 4}) + """ + return self.to_pandas().to_dict(into=into) + @_cudf_nvtx_annotate def append(self, to_append, ignore_index=False, verify_integrity=False): """Append values from another ``Series`` or array-like object. diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py index 4730f1fa296..8421d763167 100644 --- a/python/cudf/cudf/core/udf/__init__.py +++ b/python/cudf/cudf/core/udf/__init__.py @@ -1,10 +1,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. 
-import numpy as np -from numba import cuda, types -from numba.cuda.cudaimpl import ( - lower as cuda_lower, - registry as cuda_lowering_registry, -) + +from numba import types +from numba.cuda.cudaimpl import lower as cuda_lower from cudf.core.dtypes import dtype from cudf.core.udf import api, row_function, utils @@ -23,7 +20,7 @@ | {types.boolean} ) _STRING_UDFS_ENABLED = False - +cudf_str_dtype = dtype(str) try: import strings_udf from strings_udf import ptxpath @@ -31,32 +28,39 @@ if ptxpath: utils.ptx_files.append(ptxpath) - from strings_udf._lib.cudf_jit_udf import to_string_view_array - from strings_udf._typing import str_view_arg_handler, string_view + from strings_udf._lib.cudf_jit_udf import ( + column_from_udf_string_array, + column_to_string_view_array, + ) + from strings_udf._typing import ( + str_view_arg_handler, + string_view, + udf_string, + ) from . import strings_typing # isort: skip from . import strings_lowering # isort: skip - cuda_lower(api.Masked, strings_typing.string_view, types.boolean)( + cuda_lower(api.Masked, string_view, types.boolean)( masked_lowering.masked_constructor ) - - # add an overload of pack_return(string_view) - cuda_lower(api.pack_return, strings_typing.string_view)( - masked_lowering.pack_return_scalar_impl - ) - - _supported_masked_types |= {strings_typing.string_view} - utils.launch_arg_getters[dtype("O")] = to_string_view_array - utils.masked_array_types[dtype("O")] = string_view utils.JIT_SUPPORTED_TYPES |= STRING_TYPES + _supported_masked_types |= {string_view, udf_string} + + utils.launch_arg_getters[cudf_str_dtype] = column_to_string_view_array + utils.output_col_getters[cudf_str_dtype] = column_from_udf_string_array + utils.masked_array_types[cudf_str_dtype] = string_view + row_function.itemsizes[cudf_str_dtype] = string_view.size_bytes utils.arg_handlers.append(str_view_arg_handler) - row_function.itemsizes[dtype("O")] = string_view.size_bytes + + masked_typing.MASKED_INIT_MAP[udf_string] = udf_string 
_STRING_UDFS_ENABLED = True except ImportError as e: # allow cuDF to work without strings_udf pass -masked_typing.register_masked_constructor(_supported_masked_types) + +masked_typing._register_masked_constructor_typing(_supported_masked_types) +masked_lowering._register_masked_constructor_lowering(_supported_masked_types) diff --git a/python/cudf/cudf/core/udf/masked_lowering.py b/python/cudf/cudf/core/udf/masked_lowering.py index f825b6538bf..37f3117e756 100644 --- a/python/cudf/cudf/core/udf/masked_lowering.py +++ b/python/cudf/cudf/core/udf/masked_lowering.py @@ -372,10 +372,6 @@ def cast_masked_to_masked(context, builder, fromty, toty, val): # Masked constructor for use in a kernel for testing -@lower_builtin(api.Masked, types.Boolean, types.boolean) -@lower_builtin(api.Masked, types.Number, types.boolean) -@lower_builtin(api.Masked, types.NPDatetime, types.boolean) -@lower_builtin(api.Masked, types.NPTimedelta, types.boolean) def masked_constructor(context, builder, sig, args): ty = sig.return_type value, valid = args @@ -385,6 +381,11 @@ def masked_constructor(context, builder, sig, args): return masked._getvalue() +def _register_masked_constructor_lowering(supported_masked_types): + for ty in supported_masked_types: + lower_builtin(api.Masked, ty, types.boolean)(masked_constructor) + + # Allows us to make an instance of MaskedType a global variable # and properly use it inside functions we will later compile @cuda_lowering_registry.lower_constant(MaskedType) diff --git a/python/cudf/cudf/core/udf/masked_typing.py b/python/cudf/cudf/core/udf/masked_typing.py index a815a9f6dae..7baf2d585e2 100644 --- a/python/cudf/cudf/core/udf/masked_typing.py +++ b/python/cudf/cudf/core/udf/masked_typing.py @@ -169,7 +169,7 @@ def typeof_masked(val, c): # Implemented typing for Masked(value, valid) - the construction of a Masked # type in a kernel. 
-def register_masked_constructor(supported_masked_types): +def _register_masked_constructor_typing(supported_masked_types): class MaskedConstructor(ConcreteTemplate): key = api.Masked cases = [ diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py index 59041977f87..fdfd013bad7 100644 --- a/python/cudf/cudf/core/udf/strings_lowering.py +++ b/python/cudf/cudf/core/udf/strings_lowering.py @@ -7,7 +7,7 @@ from numba.core.typing import signature as nb_signature from numba.cuda.cudaimpl import lower as cuda_lower -from strings_udf._typing import size_type, string_view +from strings_udf._typing import size_type, string_view, udf_string from strings_udf.lowering import ( contains_impl, count_impl, @@ -22,8 +22,11 @@ istitle_impl, isupper_impl, len_impl, + lstrip_impl, rfind_impl, + rstrip_impl, startswith_impl, + strip_impl, ) from cudf.core.udf.masked_typing import MaskedType @@ -79,6 +82,13 @@ def masked_binary_func_impl(context, builder, sig, args): ) +create_binary_string_func("MaskedType.strip", strip_impl, udf_string) + +create_binary_string_func("MaskedType.lstrip", lstrip_impl, udf_string) + +create_binary_string_func("MaskedType.rstrip", rstrip_impl, udf_string) + + create_binary_string_func( "MaskedType.startswith", startswith_impl, diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index 1179688651f..f8f50600b12 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -13,7 +13,9 @@ id_unary_funcs, int_binary_funcs, size_type, + string_return_attrs, string_view, + udf_string, ) from cudf.core.udf import masked_typing @@ -172,6 +174,13 @@ def resolve_valid(self, mod): create_masked_binary_attr(f"MaskedType.{func}", size_type), ) +for func in string_return_attrs: + setattr( + MaskedStringViewAttrs, + f"resolve_{func}", + create_masked_binary_attr(f"MaskedType.{func}", udf_string), + ) + for func in 
id_unary_funcs: setattr( MaskedStringViewAttrs, diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index fa79088046c..4d40d41f9c3 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -10,6 +10,8 @@ from numba.np import numpy_support from numba.types import CPointer, Poison, Tuple, boolean, int64, void +import rmm + from cudf.core.column.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.core.udf.masked_typing import MaskedType @@ -31,6 +33,9 @@ precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) arg_handlers: List[Any] = [] ptx_files: List[Any] = [] +masked_array_types: Dict[Any, Any] = {} +launch_arg_getters: Dict[Any, Any] = {} +output_col_getters: Dict[Any, Any] = {} @_cudf_nvtx_annotate @@ -54,6 +59,7 @@ def _get_udf_return_type(argty, func: Callable, args=()): # Get the return type. The PTX is also returned by compile_udf, but is not # needed here. ptx, output_type = cudautils.compile_udf(func, compile_sig) + if not isinstance(output_type, MaskedType): numba_output_type = numpy_support.from_dtype(np.dtype(output_type)) else: @@ -64,6 +70,7 @@ def _get_udf_return_type(argty, func: Callable, args=()): if not isinstance(numba_output_type, MaskedType) else numba_output_type.value_type ) + result = result if result.is_internal else result.return_type # _get_udf_return_type will throw a TypingError if the user tries to use # a field in the row containing an unsupported dtype, except in the @@ -112,9 +119,6 @@ def _supported_cols_from_frame(frame): } -masked_array_types: Dict[Any, Any] = {} - - def _masked_array_type_from_col(col): """ Return a type representing a tuple of arrays, @@ -142,9 +146,12 @@ def _construct_signature(frame, return_type, args): actually JIT the kernel itself later, accounting for types and offsets. Skips columns with unsupported dtypes. 
""" - + if not return_type.is_internal: + return_type = CPointer(return_type) + else: + return_type = return_type[::1] # Tuple of arrays, first the output data array, then the mask - return_type = Tuple((return_type[::1], boolean[::1])) + return_type = Tuple((return_type, boolean[::1])) offsets = [] sig = [return_type, int64] for col in _supported_cols_from_frame(frame).values(): @@ -213,7 +220,12 @@ def _compile_or_get(frame, func, args, kernel_getter=None): # could be a MaskedType or a scalar type. kernel, scalar_return_type = kernel_getter(frame, func, args) - np_return_type = numpy_support.as_dtype(scalar_return_type) + np_return_type = ( + numpy_support.as_dtype(scalar_return_type) + if scalar_return_type.is_internal + else scalar_return_type.np_dtype + ) + precompiled[cache_key] = (kernel, np_return_type) return kernel, np_return_type @@ -230,9 +242,6 @@ def _get_kernel(kernel_string, globals_, sig, func): return kernel -launch_arg_getters: Dict[Any, Any] = {} - - def _get_input_args_from_frame(fr): args = [] offsets = [] @@ -254,8 +263,12 @@ def _get_input_args_from_frame(fr): def _return_arr_from_dtype(dt, size): + if extensionty := masked_array_types.get(dt): + return rmm.DeviceBuffer(size=size * extensionty.return_type.size_bytes) return cp.empty(size, dtype=dt) def _post_process_output_col(col, retty): + if getter := output_col_getters.get(retty): + col = getter(col) return as_column(col, retty) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 1fcfbe5fc91..4ec770e0d6b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9,6 +9,7 @@ import string import textwrap import warnings +from collections import OrderedDict, defaultdict from contextlib import contextmanager from copy import copy @@ -289,6 +290,62 @@ def test_axes(data): assert_eq(e, a) +def test_dataframe_truncate_axis_0(): + df = cudf.DataFrame( + { + "A": ["a", "b", "c", "d", "e"], + "B": 
["f", "g", "h", "i", "j"], + "C": ["k", "l", "m", "n", "o"], + }, + index=[1, 2, 3, 4, 5], + ) + pdf = df.to_pandas() + + expected = pdf.truncate(before=2, after=4, axis="index") + actual = df.truncate(before=2, after=4, axis="index") + assert_eq(actual, expected) + + expected = pdf.truncate(before=1, after=4, axis=0) + actual = df.truncate(before=1, after=4, axis=0) + assert_eq(expected, actual) + + +def test_dataframe_truncate_axis_1(): + df = cudf.DataFrame( + { + "A": ["a", "b", "c", "d", "e"], + "B": ["f", "g", "h", "i", "j"], + "C": ["k", "l", "m", "n", "o"], + }, + index=[1, 2, 3, 4, 5], + ) + pdf = df.to_pandas() + + expected = pdf.truncate(before="A", after="B", axis="columns") + actual = df.truncate(before="A", after="B", axis="columns") + assert_eq(actual, expected) + + expected = pdf.truncate(before="A", after="B", axis=1) + actual = df.truncate(before="A", after="B", axis=1) + assert_eq(actual, expected) + + +def test_dataframe_truncate_datetimeindex(): + dates = cudf.date_range( + "2021-01-01 23:45:00", "2021-01-01 23:46:00", freq="s" + ) + df = cudf.DataFrame(data={"A": 1, "B": 2}, index=dates) + pdf = df.to_pandas() + expected = pdf.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ) + actual = df.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ) + + assert_eq(actual, expected) + + def test_series_init_none(): # test for creating empty series @@ -6762,27 +6819,172 @@ def test_cudf_isclose_different_index(): assert_eq(expected, cudf.isclose(s1, s2)) -def test_dataframe_to_dict_error(): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [9, 5, 3]}) - with pytest.raises( - TypeError, - match=re.escape( - r"cuDF does not support conversion to host memory " - r"via `to_dict()` method. Consider using " - r"`.to_pandas().to_dict()` to construct a Python dictionary." 
+@pytest.mark.parametrize( + "orient", ["dict", "list", "split", "tight", "records", "index", "series"] +) +@pytest.mark.parametrize("into", [dict, OrderedDict, defaultdict(list)]) +def test_dataframe_to_dict(orient, into): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [9, 5, 3]}, index=[10, 11, 12]) + pdf = df.to_pandas() + + actual = df.to_dict(orient=orient, into=into) + expected = pdf.to_dict(orient=orient, into=into) + if orient == "series": + assert actual.keys() == expected.keys() + for key in actual.keys(): + assert_eq(expected[key], actual[key]) + else: + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data, orient, dtype, columns", + [ + ( + {"col_1": [3, 2, 1, 0], "col_2": [3, 2, 1, 0]}, + "columns", + None, + None, ), - ): - df.to_dict() + ({"col_1": [3, 2, 1, 0], "col_2": [3, 2, 1, 0]}, "index", None, None), + ( + {"col_1": [None, 2, 1, 0], "col_2": [3, None, 1, 0]}, + "index", + None, + ["A", "B", "C", "D"], + ), + ( + { + "col_1": ["ab", "cd", "ef", "gh"], + "col_2": ["zx", "one", "two", "three"], + }, + "index", + None, + ["A", "B", "C", "D"], + ), + ( + { + "index": [("a", "b"), ("a", "c")], + "columns": [("x", 1), ("y", 2)], + "data": [[1, 3], [2, 4]], + "index_names": ["n1", "n2"], + "column_names": ["z1", "z2"], + }, + "tight", + "float64", + None, + ), + ], +) +def test_dataframe_from_dict(data, orient, dtype, columns): - with pytest.raises( - TypeError, - match=re.escape( - r"cuDF does not support conversion to host memory " - r"via `to_dict()` method. Consider using " - r"`.to_pandas().to_dict()` to construct a Python dictionary." 
+ expected = pd.DataFrame.from_dict( + data=data, orient=orient, dtype=dtype, columns=columns + ) + + actual = cudf.DataFrame.from_dict( + data=data, orient=orient, dtype=dtype, columns=columns + ) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("dtype", ["int64", "str", None]) +def test_dataframe_from_dict_transposed(dtype): + pd_data = {"a": [3, 2, 1, 0], "col_2": [3, 2, 1, 0]} + gd_data = {key: cudf.Series(val) for key, val in pd_data.items()} + + expected = pd.DataFrame.from_dict(pd_data, orient="index", dtype=dtype) + actual = cudf.DataFrame.from_dict(gd_data, orient="index", dtype=dtype) + + gd_data = {key: cupy.asarray(val) for key, val in pd_data.items()} + actual = cudf.DataFrame.from_dict(gd_data, orient="index", dtype=dtype) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "pd_data, gd_data, orient, dtype, columns", + [ + ( + {"col_1": np.array([3, 2, 1, 0]), "col_2": np.array([3, 2, 1, 0])}, + { + "col_1": cupy.array([3, 2, 1, 0]), + "col_2": cupy.array([3, 2, 1, 0]), + }, + "columns", + None, + None, ), - ): - df["a"].to_dict() + ( + {"col_1": np.array([3, 2, 1, 0]), "col_2": np.array([3, 2, 1, 0])}, + { + "col_1": cupy.array([3, 2, 1, 0]), + "col_2": cupy.array([3, 2, 1, 0]), + }, + "index", + None, + None, + ), + ( + { + "col_1": np.array([None, 2, 1, 0]), + "col_2": np.array([3, None, 1, 0]), + }, + { + "col_1": cupy.array([np.nan, 2, 1, 0]), + "col_2": cupy.array([3, np.nan, 1, 0]), + }, + "index", + None, + ["A", "B", "C", "D"], + ), + ( + { + "col_1": np.array(["ab", "cd", "ef", "gh"]), + "col_2": np.array(["zx", "one", "two", "three"]), + }, + { + "col_1": np.array(["ab", "cd", "ef", "gh"]), + "col_2": np.array(["zx", "one", "two", "three"]), + }, + "index", + None, + ["A", "B", "C", "D"], + ), + ( + { + "index": [("a", "b"), ("a", "c")], + "columns": [("x", 1), ("y", 2)], + "data": [np.array([1, 3]), np.array([2, 4])], + "index_names": ["n1", "n2"], + "column_names": ["z1", "z2"], + }, + { + "index": [("a", "b"), 
("a", "c")], + "columns": [("x", 1), ("y", 2)], + "data": [cupy.array([1, 3]), cupy.array([2, 4])], + "index_names": ["n1", "n2"], + "column_names": ["z1", "z2"], + }, + "tight", + "float64", + None, + ), + ], +) +def test_dataframe_from_dict_cp_np_arrays( + pd_data, gd_data, orient, dtype, columns +): + + expected = pd.DataFrame.from_dict( + data=pd_data, orient=orient, dtype=dtype, columns=columns + ) + + actual = cudf.DataFrame.from_dict( + data=gd_data, orient=orient, dtype=dtype, columns=columns + ) + + assert_eq(expected, actual, check_dtype=dtype is not None) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index c37381a3af9..c7174adf342 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import decimal from decimal import Decimal @@ -377,3 +377,9 @@ def test_decimal_invalid_precision(): with pytest.raises(pa.ArrowInvalid): _ = cudf.Series([Decimal("300")], dtype=cudf.Decimal64Dtype(2, 1)) + + +def test_decimal_overflow(): + s = cudf.Series([Decimal("0.0009384233522166997927180531650178250")]) + result = s * s + assert_eq(cudf.Decimal128Dtype(precision=38, scale=37), result.dtype) diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index f15d705c4e2..a677ace18ec 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -5,8 +5,6 @@ import numpy as np import pandas as pd -import pyarrow as pa -import pyarrow.orc import pytest import cudf @@ -71,5 +69,5 @@ def mock_open(*args, **kwargs): monkeypatch.setattr(gcsfs.core.GCSFileSystem, "open", mock_open) gdf.to_orc(f"gcs://{gcs_fname}") - got = pa.orc.ORCFile(local_filepath).read().to_pandas() + got = pd.read_orc(local_filepath) assert_eq(pdf, got) diff --git a/python/cudf/cudf/tests/test_hdfs.py 
b/python/cudf/cudf/tests/test_hdfs.py index 8730cb187b5..f8de16f8609 100644 --- a/python/cudf/cudf/tests/test_hdfs.py +++ b/python/cudf/cudf/tests/test_hdfs.py @@ -8,7 +8,6 @@ import pandas as pd import pyarrow as pa import pytest -from pyarrow import orc import cudf from cudf.testing._utils import assert_eq @@ -212,7 +211,7 @@ def test_read_orc(datadir, hdfs, test_url): hd_fpath = f"hdfs://{basedir}/file.orc" got = cudf.read_orc(hd_fpath) - expect = orc.ORCFile(buffer).read().to_pandas() + expect = pd.read_orc(buffer) assert_eq(expect, got) @@ -232,7 +231,7 @@ def test_write_orc(pdf, hdfs, test_url): assert hdfs.exists(f"{basedir}/test_orc_writer.orc") with hdfs.open(f"{basedir}/test_orc_writer.orc", mode="rb") as f: - got = orc.ORCFile(f).read().to_pandas() + got = pd.read_orc(f) assert_eq(pdf, got) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 00d6e0b2899..14238be7bc1 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1,6 +1,7 @@ # Copyright (c) 2018-2022, NVIDIA CORPORATION. 
import copy +import gzip import itertools import os from io import BytesIO, StringIO @@ -897,3 +898,72 @@ def test_json_dtypes_nested_data(): pdf, schema=df.to_arrow().schema, safe=False ) assert df.to_arrow().equals(pa_table_pdf) + + +@pytest.mark.parametrize( + "tag, data", + [ + ( + "normal", + """\ +{"a": 1, "b": 2} +{"a": 3, "b": 4}""", + ), + ( + "multiple", + """\ + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } + { "a": { "y" : 6}, "b" : [6 ], "c": 13 } + { "a": { "y" : 6}, "b" : [7 ], "c": 14 }""", + ), + ( + "reordered", + """\ + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "c": 12 , "b" : [4, 5 ]} + { "b" : [6 ], "a": { "y" : 6}, "c": 13} + { "c" : 14, "a": { "y" : 6}, "b" : [7 ]} +""", + ), + ( + "missing", + """ + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "b" : [4, 5 ]} + { "a": { "y" : 6}, "c": 13 } + { "a": { "y" : 6}, "b" : [7 ], "c": 14 } + """, + ), + ], +) +def test_order_nested_json_reader(tag, data): + expected = cudf.read_json(StringIO(data), engine="pandas", lines=True) + target = cudf.read_json( + StringIO(data), engine="cudf_experimental", lines=True + ) + + assert_eq(expected, target, check_dtype=True) + + +def test_json_round_trip_gzip(): + df = cudf.DataFrame({"a": [1, 2, 3], "b": ["abc", "def", "ghi"]}) + bytes = BytesIO() + with gzip.open(bytes, mode="wb") as fo: + df.to_json(fo, orient="records", lines=True) + bytes.seek(0) + with gzip.open(bytes, mode="rb") as fo: + written_df = cudf.read_json(fo, orient="records", lines=True) + assert_eq(written_df, df) + + # Testing writing from middle of the file. 
+ loc = bytes.tell() + + with gzip.open(bytes, mode="wb") as fo: + fo.seek(loc) + df.to_json(fo, orient="records", lines=True) + bytes.seek(loc) + with gzip.open(bytes, mode="rb") as fo: + fo.seek(loc) + written_df = cudf.read_json(fo, orient="records", lines=True) + assert_eq(written_df, df) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index fbd9b83330e..1699c11617a 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -84,12 +84,8 @@ def _make_path_or_buf(src): ) def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine): path = datadir / inputfile - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - expect = orcfile.read(columns=columns).to_pandas() + expect = pd.read_orc(path, columns=columns) got = cudf.read_orc( path, engine=engine, columns=columns, use_index=use_index ) @@ -119,8 +115,7 @@ def test_orc_reader_local_filepath(): def test_orc_reader_filepath_or_buffer(path_or_buf, src): cols = ["int1", "long1", "float1", "double1"] - orcfile = pa.orc.ORCFile(path_or_buf("filepath")) - expect = orcfile.read(columns=cols).to_pandas() + expect = pd.read_orc(path_or_buf("filepath"), columns=cols) got = cudf.read_orc(path_or_buf(src), columns=cols) assert_eq(expect, got) @@ -128,12 +123,8 @@ def test_orc_reader_filepath_or_buffer(path_or_buf, src): def test_orc_reader_trailing_nulls(datadir): path = datadir / "TestOrcFile.nulls-at-end-snappy.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - expect = orcfile.read().to_pandas().fillna(0) + expect = pd.read_orc(path).fillna(0) got = cudf.read_orc(path).fillna(0) # PANDAS uses NaN to represent invalid data, which forces float dtype @@ -164,12 +155,8 @@ def test_orc_reader_datetimestamp(datadir, inputfile, use_index): def test_orc_reader_strings(datadir): path = datadir / 
"TestOrcFile.testStringAndBinaryStatistics.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - expect = orcfile.read(columns=["string1"]) + expect = pd.read_orc(path, columns=["string1"]) got = cudf.read_orc(path, columns=["string1"]) assert_eq(expect, got, check_categorical=False) @@ -285,12 +272,8 @@ def test_orc_read_stripes(datadir, engine): @pytest.mark.parametrize("skiprows", [0, 1, 3000]) def test_orc_read_rows(datadir, skiprows, num_rows): path = datadir / "TestOrcFile.decimal.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - pdf = orcfile.read().to_pandas() + pdf = pd.read_orc(path) gdf = cudf.read_orc(path, skiprows=skiprows, num_rows=num_rows) # Slice rows out of the whole dataframe for comparison as PyArrow doesn't @@ -329,19 +312,17 @@ def test_orc_read_skiprows(): # repro for other sizes of data skiprows = 10 - expected = cudf.read_orc(buff)[skiprows:].reset_index(drop=True) + expected = ( + pd.read_orc(buff)[skiprows:].reset_index(drop=True).astype("bool") + ) got = cudf.read_orc(buff, skiprows=skiprows) assert_eq(expected, got) def test_orc_reader_uncompressed_block(datadir): path = datadir / "uncompressed_snappy.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - expect = orcfile.read().to_pandas() + expect = pd.read_orc(path) got = cudf.read_orc(path) assert_eq(expect, got, check_categorical=False) @@ -349,15 +330,8 @@ def test_orc_reader_uncompressed_block(datadir): def test_orc_reader_nodata_block(datadir): path = datadir / "nodata.orc" - try: - orcfile = pa.orc.ORCFile(path) - except Exception as excpr: - if type(excpr).__name__ == "ArrowIOError": - pytest.skip(".orc file is not found") - else: - print(type(excpr).__name__) - expect = orcfile.read().to_pandas() + expect = pd.read_orc(path) got = cudf.read_orc(path, 
num_rows=1) assert_eq(expect, got, check_categorical=False) @@ -386,19 +360,9 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression): pdf_fname = datadir / reference_file gdf_fname = tmpdir.join("gdf.orc") - try: - orcfile = pa.orc.ORCFile(pdf_fname) - except Exception as excpr: - if type(excpr).__name__ == "ArrowIOError": - pytest.skip(".orc file is not found") - else: - print(type(excpr).__name__) - - expect = cudf.from_pandas(orcfile.read(columns=columns).to_pandas()) + expect = cudf.from_pandas(pd.read_orc(pdf_fname, columns=columns)) expect.to_orc(gdf_fname.strpath, compression=compression) - got = cudf.from_pandas( - pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() - ) + got = cudf.from_pandas(pd.read_orc(gdf_fname, columns=columns)) assert_frame_equal(expect, got) @@ -409,17 +373,9 @@ def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): pdf_fname = datadir / reference_file gdf_fname = tmpdir.join("gdf.orc") - try: - orcfile = pa.orc.ORCFile(pdf_fname) - except Exception as excpr: - if type(excpr).__name__ == "ArrowIOError": - pytest.skip(".orc file is not found") - else: - print(type(excpr).__name__) - - expect = cudf.from_pandas(orcfile.read().to_pandas()) + expect = cudf.from_pandas(pd.read_orc(pdf_fname)) expect.to_orc(gdf_fname.strpath, statistics=stats_freq) - got = cudf.from_pandas(pa.orc.ORCFile(gdf_fname).read().to_pandas()) + got = cudf.from_pandas(pd.read_orc(gdf_fname)) assert_frame_equal(expect, got) @@ -430,14 +386,6 @@ def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): pdf_fname = datadir / reference_file gdf_fname = tmpdir.join("chunked_gdf.orc") - try: - orcfile = pa.orc.ORCFile(pdf_fname) - except Exception as excpr: - if type(excpr).__name__ == "ArrowIOError": - pytest.skip(".orc file is not found") - else: - print(type(excpr).__name__) - columns = [ "boolean1", "byte1", @@ -447,7 +395,7 @@ def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, 
stats_freq): "float1", "double1", ] - pdf = orcfile.read(columns=columns).to_pandas() + pdf = pd.read_orc(pdf_fname, columns=columns) gdf = cudf.from_pandas(pdf) expect = pd.concat([pdf, pdf]).reset_index(drop=True) @@ -456,7 +404,7 @@ def test_chunked_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): writer.write_table(gdf) writer.close() - got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + got = pd.read_orc(gdf_fname) assert_eq(expect, got) @@ -486,15 +434,7 @@ def test_chunked_orc_writer( pdf_fname = datadir / reference_file gdf_fname = tmpdir.join("chunked_gdf.orc") - try: - orcfile = pa.orc.ORCFile(pdf_fname) - except Exception as excpr: - if type(excpr).__name__ == "ArrowIOError": - pytest.skip(".orc file is not found") - else: - print(type(excpr).__name__) - - pdf = orcfile.read(columns=columns).to_pandas() + pdf = pd.read_orc(pdf_fname, columns=columns) gdf = cudf.from_pandas(pdf) expect = pd.concat([pdf, pdf]).reset_index(drop=True) @@ -503,7 +443,7 @@ def test_chunked_orc_writer( writer.write_table(gdf) writer.close() - got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() + got = pd.read_orc(gdf_fname, columns=columns) assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got)) @@ -521,7 +461,7 @@ def test_orc_writer_strings(tmpdir, dtypes): expect = cudf.datasets.randomdata(nrows=10, dtypes=dtypes, seed=1) expect.to_orc(gdf_fname) - got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + got = pd.read_orc(gdf_fname) assert_eq(expect, got) @@ -546,7 +486,7 @@ def test_chunked_orc_writer_strings(tmpdir, dtypes): writer.write_table(gdf) writer.close() - got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + got = pd.read_orc(gdf_fname) assert_eq(expect, got) @@ -577,13 +517,8 @@ def test_orc_writer_sliced(tmpdir): def test_orc_reader_decimal_type(datadir, orc_file): file_path = datadir / orc_file - try: - orcfile = pa.orc.ORCFile(file_path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - - pdf = 
orcfile.read().to_pandas() - df = cudf.read_orc(file_path).to_pandas() + pdf = pd.read_orc(file_path) + df = cudf.read_orc(file_path) assert_eq(pdf, df) @@ -591,13 +526,8 @@ def test_orc_reader_decimal_type(datadir, orc_file): def test_orc_decimal_precision_fail(datadir): file_path = datadir / "TestOrcFile.int_decimal.precision_19.orc" - try: - orcfile = pa.orc.ORCFile(file_path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - # Shouldn't cause failure if decimal column is not chosen to be read. - pdf = orcfile.read(columns=["int"]).to_pandas() + pdf = pd.read_orc(file_path, columns=["int"]) gdf = cudf.read_orc(file_path, columns=["int"]) assert_eq(pdf, gdf) @@ -624,13 +554,9 @@ def test_orc_reader_tzif_timestamps(datadir): # Contains timstamps in the range covered by the TZif file # Other timedate tests only cover "future" times path = datadir / "TestOrcFile.lima_timezone.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc(path).to_pandas() + pdf = pd.read_orc(path) + gdf = cudf.read_orc(path) assert_eq(pdf, gdf) @@ -882,13 +808,9 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): def test_orc_reader_gmt_timestamps(datadir): path = datadir / "TestOrcFile.gmt.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc(path).to_pandas() + pdf = pd.read_orc(path) + gdf = cudf.read_orc(path) assert_eq(pdf, gdf) @@ -914,7 +836,7 @@ def test_orc_bool_encode_fail(): okay_df.to_orc(buffer) # Also validate data - pdf = pa.orc.ORCFile(buffer).read().to_pandas() + pdf = pd.read_orc(buffer) assert_eq(okay_df.to_pandas(nullable=True), pdf) @@ -929,8 +851,8 @@ def test_nanoseconds_overflow(): cudf_got = cudf.read_orc(buffer) assert_eq(expected, cudf_got) - pyarrow_got = 
pa.orc.ORCFile(buffer).read() - assert_eq(expected.to_pandas(), pyarrow_got.to_pandas()) + pandas_got = pd.read_orc(buffer) + assert_eq(expected, pandas_got) def test_empty_dataframe(): @@ -1207,7 +1129,7 @@ def test_skip_rows_for_nested_types(columns, list_struct_buff): def test_pyspark_struct(datadir): path = datadir / "TestOrcFile.testPySparkStruct.orc" - pdf = pa.orc.ORCFile(path).read().to_pandas() + pdf = pd.read_orc(path) gdf = cudf.read_orc(path) assert_eq(pdf, gdf) @@ -1391,13 +1313,9 @@ def test_map_type_read(columns, num_rows, use_index): def test_orc_reader_decimal(datadir): path = datadir / "TestOrcFile.decimal.orc" - try: - orcfile = pa.orc.ORCFile(path) - except pa.ArrowIOError as e: - pytest.skip(".orc file is not found: %s" % e) - pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc(path).to_pandas() + pdf = pd.read_orc(path) + gdf = cudf.read_orc(path) assert_eq(pdf, gdf) @@ -1478,7 +1396,7 @@ def test_orc_writer_lists(data): buffer, stripe_size_rows=2048, row_index_stride=512 ) - pdf_out = pa.orc.ORCFile(buffer).read().to_pandas() + pdf_out = pd.read_orc(buffer) assert_eq(pdf_out, pdf_in) @@ -1500,7 +1418,7 @@ def test_chunked_orc_writer_lists(): writer.write_table(gdf) writer.close() - got = pa.orc.ORCFile(buffer).read().to_pandas() + got = pd.read_orc(buffer) assert_eq(expect, got) @@ -1508,17 +1426,9 @@ def test_writer_timestamp_stream_size(datadir, tmpdir): pdf_fname = datadir / "TestOrcFile.largeTimestamps.orc" gdf_fname = tmpdir.join("gdf.orc") - try: - orcfile = pa.orc.ORCFile(pdf_fname) - except Exception as excpr: - if type(excpr).__name__ == "ArrowIOError": - pytest.skip(".orc file is not found") - else: - print(type(excpr).__name__) - - expect = orcfile.read().to_pandas() + expect = pd.read_orc(pdf_fname) cudf.from_pandas(expect).to_orc(gdf_fname.strpath) - got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + got = pd.read_orc(gdf_fname) assert_eq(expect, got) @@ -1591,7 +1501,7 @@ def test_orc_writer_lists_empty_rg(data): df = 
cudf.read_orc(buffer) assert_eq(df, cudf_in) - pdf_out = pa.orc.ORCFile(buffer).read().to_pandas() + pdf_out = pd.read_orc(buffer) assert_eq(pdf_in, pdf_out) @@ -1696,7 +1606,7 @@ def test_orc_writer_rle_stream_size(datadir, tmpdir): # Segfaults when RLE stream sizes don't account for varint length pa_out = pa.orc.ORCFile(reencoded).read() - assert_eq(df.to_pandas(), pa_out) + assert df.to_arrow().equals(pa_out) def test_empty_columns(): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index d3eceeddc10..6a55fece6ff 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2658,11 +2658,11 @@ def test_parquet_writer_zstd(): buff = BytesIO() try: - expected.to_orc(buff, compression="ZSTD") + expected.to_parquet(buff, compression="ZSTD") except RuntimeError: pytest.mark.xfail(reason="Newer nvCOMP version is required") else: - got = pd.read_orc(buff) + got = pd.read_parquet(buff) assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index d2339930b91..de3bba25223 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -7,9 +7,7 @@ import numpy as np import pandas as pd -import pyarrow as pa import pyarrow.fs as pa_fs -import pyarrow.orc import pytest from fsspec.core import get_fs_token_paths @@ -442,7 +440,7 @@ def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") fname = "test_orc_reader.orc" bucket = "orc" - expect = pa.orc.ORCFile(source_file).read().to_pandas() + expect = pd.read_orc(source_file) with open(source_file, "rb") as f: buffer = f.read() @@ -465,7 +463,7 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") fname = "test_orc_reader.orc" bucket = "orc" - expect = pa.orc.ORCFile(source_file).read().to_pandas() + 
expect = pd.read_orc(source_file) with open(source_file, "rb") as f: buffer = f.read() @@ -491,7 +489,7 @@ def test_write_orc(s3_base, s3so, pdf): assert s3fs.exists(f"s3://{bucket}/{fname}") with s3fs.open(f"s3://{bucket}/{fname}") as f: - got = pa.orc.ORCFile(f).read().to_pandas() + got = pd.read_orc(f) assert_eq(pdf, got) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index c9587438b09..2525f055738 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -3,6 +3,7 @@ import hashlib import operator import re +from collections import OrderedDict, defaultdict from string import ascii_letters, digits import cupy as cp @@ -1614,6 +1615,47 @@ def test_axes(data): assert_eq(e, a) +def test_series_truncate(): + csr = cudf.Series([1, 2, 3, 4]) + psr = csr.to_pandas() + + assert_eq(csr.truncate(), psr.truncate()) + assert_eq(csr.truncate(1, 2), psr.truncate(1, 2)) + assert_eq(csr.truncate(before=1, after=2), psr.truncate(before=1, after=2)) + + +def test_series_truncate_errors(): + csr = cudf.Series([1, 2, 3, 4]) + with pytest.raises(ValueError): + csr.truncate(axis=1) + with pytest.raises(ValueError): + csr.truncate(copy=False) + + csr.index = [3, 2, 1, 6] + psr = csr.to_pandas() + assert_exceptions_equal( + lfunc=csr.truncate, + rfunc=psr.truncate, + ) + + +def test_series_truncate_datetimeindex(): + dates = cudf.date_range( + "2021-01-01 23:45:00", "2021-01-02 23:46:00", freq="s" + ) + csr = cudf.Series(range(len(dates)), index=dates) + psr = csr.to_pandas() + + assert_eq( + csr.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ), + psr.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ), + ) + + @pytest.mark.parametrize( "data", [ @@ -1957,3 +1999,14 @@ def test_int64_equality(): s = cudf.Series(np.asarray([2**63 - 10, 2**63 - 100], dtype=np.int64)) assert (s != np.int64(2**63 - 1)).all() assert (s != cudf.Scalar(2**63 - 1, 
dtype=np.int64)).all() + + +@pytest.mark.parametrize("into", [dict, OrderedDict, defaultdict(list)]) +def test_series_to_dict(into): + gs = cudf.Series(["ab", "de", "zx"], index=[10, 20, 100]) + ps = gs.to_pandas() + + actual = gs.to_dict(into=into) + expected = ps.to_dict(into=into) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 13b342e6c3b..0298a62b9d2 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -108,6 +108,16 @@ def test_series_set_item(psr, arg): assert_eq(psr, gsr) +def test_series_setitem_singleton_range(): + sr = cudf.Series([1, 2, 3], dtype=np.int64) + psr = sr.to_pandas() + value = np.asarray([7], dtype=np.int64) + sr.iloc[:1] = value + psr.iloc[:1] = value + assert_eq(sr, cudf.Series([7, 2, 3], dtype=np.int64)) + assert_eq(sr, psr, check_dtype=True) + + @pytest.mark.parametrize( "df", [ @@ -297,3 +307,48 @@ def test_series_slice_setitem_struct(): actual[0:3] = cudf.Scalar({"a": {"b": 5050}, "b": 101}) assert_eq(actual, expected) + + +@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) +@pytest.mark.parametrize("indices", [0, [1, 2]]) +def test_series_setitem_upcasting(dtype, indices): + sr = pd.Series([0, 0, 0], dtype=dtype) + cr = cudf.from_pandas(sr) + assert_eq(sr, cr) + # Must be a non-integral floating point value that can't be losslessly + # converted to float32, otherwise pandas will try and match the source + # column dtype. 
+ new_value = np.float64(np.pi) + col_ref = cr._column + sr[indices] = new_value + cr[indices] = new_value + if PANDAS_GE_150: + assert_eq(sr, cr) + else: + # pandas bug, incorrectly fails to upcast from float32 to float64 + assert_eq(sr.values, cr.values) + if dtype == np.float64: + # no-op type cast should not modify backing column + assert col_ref == cr._column + + +# TODO: these two tests could perhaps be changed once specifics of +# pandas compat wrt upcasting are decided on; this is just baking in +# status-quo. +def test_series_setitem_upcasting_string_column(): + sr = pd.Series([0, 0, 0], dtype=str) + cr = cudf.from_pandas(sr) + new_value = np.float64(10.5) + sr[0] = str(new_value) + cr[0] = new_value + assert_eq(sr, cr) + + +def test_series_setitem_upcasting_string_value(): + sr = cudf.Series([0, 0, 0], dtype=int) + # This is a distinction with pandas, which lets you instead make an + # object column with ["10", 0, 0] + sr[0] = "10" + assert_eq(pd.Series([10, 0, 0], dtype=int), sr) + with pytest.raises(ValueError): + sr[0] = "non-integer" diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index f1d110ba168..7af47f981d6 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -868,6 +868,41 @@ def func(row): run_masked_udf_test(func, str_udf_data, check_dtype=False) +@string_udf_test +def test_string_udf_return_string(str_udf_data): + def func(row): + return row["str_col"] + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_strip(str_udf_data, strip_char): + def func(row): + return row["str_col"].strip(strip_char) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_lstrip(str_udf_data, 
strip_char): + def func(row): + return row["str_col"].lstrip(strip_char) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@string_udf_test +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_rstrip(str_udf_data, strip_char): + def func(row): + return row["str_col"].rstrip(strip_char) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + @pytest.mark.parametrize( "data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]] ) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 9146405c6ed..2c4b73666a5 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1953,7 +1953,15 @@ def _fsspec_data_transfer( # Calculate total file size if file_like: - file_size = path_or_fob.size + try: + file_size = path_or_fob.size + except AttributeError: + # Find file size if there is no `size` + # attribute + old_file_position = path_or_fob.tell() + path_or_fob.seek(0, os.SEEK_END) + file_size = path_or_fob.tell() + path_or_fob.seek(old_file_position, os.SEEK_SET) file_size = file_size or fs.size(path_or_fob) # Check if a direct read makes the most sense diff --git a/python/custreamz/README.md b/python/custreamz/README.md index 99ada746ec8..a1d98425d66 100644 --- a/python/custreamz/README.md +++ b/python/custreamz/README.md @@ -48,8 +48,8 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids ### CUDA/GPU requirements -* CUDA 10.0+ -* NVIDIA driver 410.48+ +* CUDA 11.0+ +* NVIDIA driver 450.80.02+ * Pascal architecture or better (Compute Capability >=6.0) ### Conda diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index f02c75eb3e8..49b5e725fed 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -469,16 +469,18 @@ class CudfBackendEntrypoint(DataFrameBackendEntrypoint): """ @staticmethod - def from_dict(data, npartitions, 
orient="columns", **kwargs): - from dask_cudf import from_cudf - - if orient != "columns": - raise ValueError(f"orient={orient} is not supported") - # TODO: Use cudf.from_dict - # (See: https://github.com/rapidsai/cudf/issues/11934) - return from_cudf( - cudf.DataFrame(data), + def from_dict( + data, npartitions, orient="columns", dtype=None, columns=None + ): + + return _default_backend( + dd.from_dict, + data, npartitions=npartitions, + orient=orient, + dtype=dtype, + columns=columns, + constructor=cudf.DataFrame, ) @staticmethod @@ -493,13 +495,10 @@ def read_parquet(*args, engine=None, **kwargs): ) @staticmethod - def read_json(*args, engine=None, **kwargs): - return _default_backend( - dd.read_json, - *args, - engine=cudf.read_json, - **kwargs, - ) + def read_json(*args, **kwargs): + from dask_cudf.io.json import read_json + + return read_json(*args, **kwargs) @staticmethod def read_orc(*args, **kwargs): diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index f5258e6cab8..a56f70e7ae2 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -1,5 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+from functools import wraps from typing import Set import numpy as np @@ -16,12 +17,8 @@ import cudf from cudf.utils.utils import _dask_cudf_nvtx_annotate -CUMULATIVE_AGGS = ( - "cumsum", - "cumcount", -) - -AGGS = ( +# aggregations that are dask-cudf optimized +OPTIMIZED_AGGS = ( "count", "mean", "std", @@ -34,19 +31,18 @@ "last", ) -SUPPORTED_AGGS = (*AGGS, *CUMULATIVE_AGGS) - -def _check_groupby_supported(func): +def _check_groupby_optimized(func): """ Decorator for dask-cudf's groupby methods that returns the dask-cudf - method if the groupby object is supported, otherwise reverting to the - upstream Dask method + optimized method if the groupby object is supported, otherwise + reverting to the upstream Dask method """ + @wraps(func) def wrapper(*args, **kwargs): gb = args[0] - if _groupby_supported(gb): + if _groupby_optimized(gb): return func(*args, **kwargs) # note that we use upstream Dask's default kwargs for this call if # none are specified; this shouldn't be an issue as those defaults are @@ -94,7 +90,7 @@ def _make_groupby_method_aggs(self, agg_name): return {c: agg_name for c in self.obj.columns if c != self.by} @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def count(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -109,7 +105,7 @@ def count(self, split_every=None, split_out=1): ) @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -124,7 +120,7 @@ def mean(self, split_every=None, split_out=1): ) @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def std(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -139,7 +135,7 @@ def std(self, split_every=None, split_out=1): ) @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def var(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -154,7 
+150,7 @@ def var(self, split_every=None, split_out=1): ) @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def sum(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -169,7 +165,7 @@ def sum(self, split_every=None, split_out=1): ) @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def min(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -184,7 +180,7 @@ def min(self, split_every=None, split_out=1): ) @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def max(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -199,7 +195,7 @@ def max(self, split_every=None, split_out=1): ) @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -214,7 +210,7 @@ def collect(self, split_every=None, split_out=1): ) @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def first(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -229,7 +225,7 @@ def first(self, split_every=None, split_out=1): ) @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def last(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -250,7 +246,7 @@ def aggregate(self, arg, split_every=None, split_out=1, shuffle=None): arg = _redirect_aggs(arg) - if _groupby_supported(self) and _aggs_supported(arg, SUPPORTED_AGGS): + if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): if isinstance(self._meta.grouping.keys, cudf.MultiIndex): keys = self._meta.grouping.keys.names else: @@ -287,7 +283,7 @@ def __init__(self, *args, sort=None, **kwargs): super().__init__(*args, sort=sort, **kwargs) @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def count(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ 
-302,7 +298,7 @@ def count(self, split_every=None, split_out=1): )[self._slice] @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -317,7 +313,7 @@ def mean(self, split_every=None, split_out=1): )[self._slice] @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def std(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -332,7 +328,7 @@ def std(self, split_every=None, split_out=1): )[self._slice] @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def var(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -347,7 +343,7 @@ def var(self, split_every=None, split_out=1): )[self._slice] @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def sum(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -362,7 +358,7 @@ def sum(self, split_every=None, split_out=1): )[self._slice] @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def min(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -377,7 +373,7 @@ def min(self, split_every=None, split_out=1): )[self._slice] @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def max(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -392,7 +388,7 @@ def max(self, split_every=None, split_out=1): )[self._slice] @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -407,7 +403,7 @@ def collect(self, split_every=None, split_out=1): )[self._slice] @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def first(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -422,7 +418,7 @@ def first(self, split_every=None, split_out=1): 
)[self._slice] @_dask_cudf_nvtx_annotate - @_check_groupby_supported + @_check_groupby_optimized def last(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -446,7 +442,7 @@ def aggregate(self, arg, split_every=None, split_out=1, shuffle=None): if not isinstance(arg, dict): arg = {self._slice: arg} - if _groupby_supported(self) and _aggs_supported(arg, SUPPORTED_AGGS): + if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): return groupby_agg( self.obj, self.by, @@ -569,9 +565,9 @@ def groupby_agg( """ # Assert that aggregations are supported aggs = _redirect_aggs(aggs_in) - if not _aggs_supported(aggs, SUPPORTED_AGGS): + if not _aggs_optimized(aggs, OPTIMIZED_AGGS): raise ValueError( - f"Supported aggs include {SUPPORTED_AGGS} for groupby_agg API. " + f"Supported aggs include {OPTIMIZED_AGGS} for groupby_agg API. " f"Aggregations must be specified with dict or list syntax." ) @@ -689,8 +685,13 @@ def groupby_agg( "with `sort=False`, or set `shuffle=True`." 
) + # Determine required columns to enable column projection + required_columns = list( + set(gb_cols).union(aggs.keys()).intersection(ddf.columns) + ) + return aca( - [ddf], + [ddf[required_columns]], chunk=chunk, chunk_kwargs=chunk_kwargs, combine=combine, @@ -735,7 +736,7 @@ def _redirect_aggs(arg): @_dask_cudf_nvtx_annotate -def _aggs_supported(arg, supported: set): +def _aggs_optimized(arg, supported: set): """Check that aggregations in `arg` are a subset of `supported`""" if isinstance(arg, (list, dict)): if isinstance(arg, dict): @@ -757,8 +758,8 @@ def _aggs_supported(arg, supported: set): @_dask_cudf_nvtx_annotate -def _groupby_supported(gb): - """Check that groupby input is supported by dask-cudf""" +def _groupby_optimized(gb): + """Check that groupby input can use dask-cudf optimized codepath""" return isinstance(gb.obj, DaskDataFrame) and ( isinstance(gb.by, str) or (isinstance(gb.by, list) and all(isinstance(x, str) for x in gb.by)) @@ -830,7 +831,7 @@ def _tree_node_agg(df, gb_cols, dropna, sort, sep): agg = col.split(sep)[-1] if agg in ("count", "sum"): agg_dict[col] = ["sum"] - elif agg in SUPPORTED_AGGS: + elif agg in OPTIMIZED_AGGS: agg_dict[col] = [agg] else: raise ValueError(f"Unexpected aggregation: {agg}") diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index 6c3c95d1a2e..6ab2ba415a5 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -6,4 +6,66 @@ import cudf -read_json = partial(dask.dataframe.read_json, engine=cudf.read_json) +from dask_cudf.backends import _default_backend + + +def read_json(url_path, engine="auto", **kwargs): + """Create a dask_cudf DataFrame collection from JSON data + + This function wraps ``dask.dataframe.read_json``, and passes + ``engine=partial(cudf.read_json, engine="auto")`` by default. + + Parameters + ---------- + url_path: str, list of str + Location to read from. 
If a string, can include a glob character to + find a set of file names. + Supports protocol specifications such as ``"s3://"``. + engine : str or Callable, default "auto" + If str, this value will be used as the ``engine`` argument when + ``cudf.read_json`` is used to create each partition. If Callable, + this value will be used as the underlying function used to create + each partition from JSON data. The default value is "auto", so + that ``engine=partial(cudf.read_json, engine="auto")`` will be + passed to ``dask.dataframe.read_json`` by default. + **kwargs : + Keyword arguments to pass through to ``dask.dataframe.read_json``. + + Returns + ------- + dask_cudf.DataFrame + + Examples + -------- + Load single file + + >>> from dask_cudf import read_json + >>> read_json('myfile.json') # doctest: +SKIP + + Load large line-delimited JSON files using partitions of approx + 256MB size + + >>> read_json('data/file*.json', blocksize=2**28) # doctest: +SKIP + + Load nested JSON data + + >>> read_json('myfile.json', engine='cudf_experimental') # doctest: +SKIP + + See Also + -------- + dask.dataframe.io.json.read_json + """ + + # TODO: Add optimized code path to leverage the + # `byte_range` argument in `cudf.read_json` for + # local storage (see `dask_cudf.read_csv`) + return _default_backend( + dask.dataframe.read_json, + url_path, + engine=( + partial(cudf.read_json, engine=engine) + if isinstance(engine, str) + else engine + ), + **kwargs, + ) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index d19f7736e8e..9d26bf06545 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -71,3 +71,21 @@ def test_read_json_lines(lines): actual = dask_cudf.read_json(f, orient="records", lines=lines) actual_pd = pd.read_json(f, orient="records", lines=lines) dd.assert_eq(actual, actual_pd) + + +def test_read_json_nested_experimental(tmp_path): + # Check
that `engine="cudf_experimental"` can + # be used to support nested data + df = pd.DataFrame( + { + "a": [{"y": 2}, {"y": 4}, {"y": 6}, {"y": 8}], + "b": [[1, 2, 3], [4, 5], [6], [7]], + "c": [1, 3, 5, 7], + } + ) + kwargs = dict(orient="records", lines=True) + with tmp_path / "data.json" as f: + df.to_json(f, **kwargs) + actual = dask_cudf.read_json(f, engine="cudf_experimental", **kwargs) + actual_pd = pd.read_json(f, **kwargs) + dd.assert_eq(actual, actual_pd) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index f2047c34684..1f018e79ff7 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -6,16 +6,28 @@ import dask from dask import dataframe as dd +from dask.utils_test import hlg_layer import cudf from cudf.core._compat import PANDAS_GE_120 import dask_cudf -from dask_cudf.groupby import AGGS, CUMULATIVE_AGGS, _aggs_supported +from dask_cudf.groupby import OPTIMIZED_AGGS, _aggs_optimized -@pytest.fixture -def pdf(): +def assert_cudf_groupby_layers(ddf): + for prefix in ("cudf-aggregate-chunk", "cudf-aggregate-agg"): + try: + hlg_layer(ddf.dask, prefix) + except KeyError: + raise AssertionError( + "Expected Dask dataframe to contain groupby layer with " + f"prefix {prefix}" + ) + + +@pytest.fixture(params=["non_null", "null"]) +def pdf(request): np.random.seed(0) # note that column name "x" is a substring of the groupby key; @@ -27,13 +39,17 @@ def pdf(): "y": np.random.normal(size=10000), } ) + + # insert nulls into dataframe at random + if request.param == "null": + pdf = pdf.mask(np.random.choice([True, False], size=pdf.shape)) + return pdf -@pytest.mark.parametrize("aggregation", AGGS) +@pytest.mark.parametrize("aggregation", OPTIMIZED_AGGS) @pytest.mark.parametrize("series", [False, True]) def test_groupby_basic(series, aggregation, pdf): - gdf = cudf.DataFrame.from_pandas(pdf) gdf_grouped = gdf.groupby("xx") ddf_grouped = 
dask_cudf.from_cudf(gdf, npartitions=5).groupby("xx") @@ -42,30 +58,38 @@ def test_groupby_basic(series, aggregation, pdf): gdf_grouped = gdf_grouped.xx ddf_grouped = ddf_grouped.xx - a = getattr(gdf_grouped, aggregation)() - b = getattr(ddf_grouped, aggregation)().compute() + check_dtype = aggregation != "count" - if aggregation == "count": - dd.assert_eq(a, b, check_dtype=False) - else: - dd.assert_eq(a, b) + expect = getattr(gdf_grouped, aggregation)() + actual = getattr(ddf_grouped, aggregation)() - a = gdf_grouped.agg({"xx": aggregation}) - b = ddf_grouped.agg({"xx": aggregation}).compute() + assert_cudf_groupby_layers(actual) - if aggregation == "count": - dd.assert_eq(a, b, check_dtype=False) - else: - dd.assert_eq(a, b) + dd.assert_eq(expect, actual, check_dtype=check_dtype) + + expect = gdf_grouped.agg({"xx": aggregation}) + actual = ddf_grouped.agg({"xx": aggregation}) + + assert_cudf_groupby_layers(actual) + + dd.assert_eq(expect, actual, check_dtype=check_dtype) +# TODO: explore adding support with `.agg()` @pytest.mark.parametrize("series", [True, False]) -@pytest.mark.parametrize("aggregation", CUMULATIVE_AGGS) +@pytest.mark.parametrize("aggregation", ["cumsum", "cumcount"]) def test_groupby_cumulative(aggregation, pdf, series): gdf = cudf.DataFrame.from_pandas(pdf) ddf = dask_cudf.from_cudf(gdf, npartitions=5) - gdf_grouped = gdf.groupby("xx") + if pdf.isna().sum().any(): + with pytest.xfail( + reason="https://github.com/rapidsai/cudf/issues/12055" + ): + gdf_grouped = gdf.groupby("xx") + else: + gdf_grouped = gdf.groupby("xx") + ddf_grouped = ddf.groupby("xx") if series: @@ -73,7 +97,7 @@ def test_groupby_cumulative(aggregation, pdf, series): ddf_grouped = ddf_grouped.xx a = getattr(gdf_grouped, aggregation)() - b = getattr(ddf_grouped, aggregation)().compute() + b = getattr(ddf_grouped, aggregation)() if aggregation == "cumsum" and series: with pytest.xfail(reason="https://github.com/dask/dask/issues/9313"): @@ -82,37 +106,35 @@ def 
test_groupby_cumulative(aggregation, pdf, series): dd.assert_eq(a, b) +@pytest.mark.parametrize("aggregation", OPTIMIZED_AGGS) @pytest.mark.parametrize( "func", [ - lambda df: df.groupby("x").agg({"y": "max"}), - lambda df: df.groupby("x").agg(["sum", "max"]), - lambda df: df.groupby("x").y.agg(["sum", "max"]), - lambda df: df.groupby("x").agg("sum"), - lambda df: df.groupby("x").y.agg("sum"), + lambda df, agg: df.groupby("xx").agg({"y": agg}), + lambda df, agg: df.groupby("xx").y.agg({"y": agg}), + lambda df, agg: df.groupby("xx").agg([agg]), + lambda df, agg: df.groupby("xx").y.agg([agg]), + lambda df, agg: df.groupby("xx").agg(agg), + lambda df, agg: df.groupby("xx").y.agg(agg), ], ) -def test_groupby_agg(func): - pdf = pd.DataFrame( - { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), - } - ) - +def test_groupby_agg(func, aggregation, pdf): gdf = cudf.DataFrame.from_pandas(pdf) ddf = dask_cudf.from_cudf(gdf, npartitions=5) - a = func(gdf).to_pandas() - b = func(ddf).compute().to_pandas() + actual = func(ddf, aggregation) + expect = func(gdf, aggregation) - a.index.name = None - a.name = None - b.index.name = None - b.name = None + check_dtype = aggregation != "count" - dd.assert_eq(a, b) + assert_cudf_groupby_layers(actual) + + # groupby.agg should add an explicit getitem layer + # to improve/enable column projection + assert hlg_layer(actual.dask, "getitem") + + dd.assert_eq(expect, actual, check_names=False, check_dtype=check_dtype) @pytest.mark.parametrize("split_out", [1, 3]) @@ -136,28 +158,6 @@ def test_groupby_agg_empty_partition(tmpdir, split_out): dd.assert_eq(gb.compute().sort_index(), expect) -@pytest.mark.parametrize( - "func", - [lambda df: df.groupby("x").std(), lambda df: df.groupby("x").y.std()], -) -def test_groupby_std(func): - pdf = pd.DataFrame( - { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), - } - ) - - gdf = cudf.DataFrame.from_pandas(pdf) - - ddf = 
dask_cudf.from_cudf(gdf, npartitions=5) - - a = func(gdf).to_pandas() - b = func(ddf).compute().to_pandas() - - dd.assert_eq(a, b) - - # reason gotattr in cudf @pytest.mark.parametrize( "func", @@ -710,7 +710,7 @@ def test_groupby_agg_redirect(aggregations): ], ) def test_is_supported(arg, supported): - assert _aggs_supported(arg, AGGS) is supported + assert _aggs_optimized(arg, OPTIMIZED_AGGS) is supported def test_groupby_unique_lists(): @@ -746,22 +746,20 @@ def test_groupby_first_last(data, agg): gddf = dask_cudf.from_cudf(gdf, npartitions=2) dd.assert_eq( - ddf.groupby("a").agg(agg).compute(), - gddf.groupby("a").agg(agg).compute(), + ddf.groupby("a").agg(agg), + gddf.groupby("a").agg(agg), ) dd.assert_eq( - getattr(ddf.groupby("a"), agg)().compute(), - getattr(gddf.groupby("a"), agg)().compute(), + getattr(ddf.groupby("a"), agg)(), + getattr(gddf.groupby("a"), agg)(), ) - dd.assert_eq( - gdf.groupby("a").agg(agg), gddf.groupby("a").agg(agg).compute() - ) + dd.assert_eq(gdf.groupby("a").agg(agg), gddf.groupby("a").agg(agg)) dd.assert_eq( getattr(gdf.groupby("a"), agg)(), - getattr(gddf.groupby("a"), agg)().compute(), + getattr(gddf.groupby("a"), agg)(), ) diff --git a/python/strings_udf/cpp/CMakeLists.txt b/python/strings_udf/cpp/CMakeLists.txt index e5b4aca7076..3e58d10d6e2 100644 --- a/python/strings_udf/cpp/CMakeLists.txt +++ b/python/strings_udf/cpp/CMakeLists.txt @@ -92,6 +92,10 @@ endfunction() # Create the shim library for each architecture. 
set(SHIM_CUDA_FLAGS --expt-relaxed-constexpr -rdc=true) +# always build a default PTX file in case RAPIDS_NO_INITIALIZE is set and the device cc can't be +# safely queried through a context +list(INSERT CMAKE_CUDA_ARCHITECTURES 0 "60") + list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-real" "") list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-virtual" "") list(SORT CMAKE_CUDA_ARCHITECTURES) diff --git a/python/strings_udf/cpp/src/strings/udf/shim.cu b/python/strings_udf/cpp/src/strings/udf/shim.cu index 4d6690468ff..63e740c5226 100644 --- a/python/strings_udf/cpp/src/strings/udf/shim.cu +++ b/python/strings_udf/cpp/src/strings/udf/shim.cu @@ -17,6 +17,8 @@ #include #include #include +#include +#include using namespace cudf::strings::udf; @@ -215,3 +217,56 @@ extern "C" __device__ int pycount(int* nb_retval, void const* str, void const* s *nb_retval = count(*str_view, *substr_view); return 0; } + +extern "C" __device__ int udf_string_from_string_view(int* nb_retbal, + void const* str, + void* udf_str) +{ + auto str_view_ptr = reinterpret_cast(str); + auto udf_str_ptr = reinterpret_cast(udf_str); + *udf_str_ptr = udf_string(*str_view_ptr); + + return 0; +} + +extern "C" __device__ int strip(int* nb_retval, + void* udf_str, + void* const* to_strip, + void* const* strip_str) +{ + auto to_strip_ptr = reinterpret_cast(to_strip); + auto strip_str_ptr = reinterpret_cast(strip_str); + auto udf_str_ptr = reinterpret_cast(udf_str); + + *udf_str_ptr = strip(*to_strip_ptr, *strip_str_ptr); + + return 0; +} + +extern "C" __device__ int lstrip(int* nb_retval, + void* udf_str, + void* const* to_strip, + void* const* strip_str) +{ + auto to_strip_ptr = reinterpret_cast(to_strip); + auto strip_str_ptr = reinterpret_cast(strip_str); + auto udf_str_ptr = reinterpret_cast(udf_str); + + *udf_str_ptr = strip(*to_strip_ptr, *strip_str_ptr, cudf::strings::side_type::LEFT); + + return 0; +} + +extern "C" __device__ int rstrip(int* nb_retval, + void* udf_str, + void* const* to_strip, + 
void* const* strip_str) +{ + auto to_strip_ptr = reinterpret_cast(to_strip); + auto strip_str_ptr = reinterpret_cast(strip_str); + auto udf_str_ptr = reinterpret_cast(udf_str); + + *udf_str_ptr = strip(*to_strip_ptr, *strip_str_ptr, cudf::strings::side_type::RIGHT); + + return 0; +} diff --git a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu index 7927740fd49..b4d5014d9e0 100644 --- a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu +++ b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu @@ -58,7 +58,7 @@ std::unique_ptr to_string_view_array(cudf::column_view const { return std::make_unique( std::move(cudf::strings::detail::create_string_vector_from_column( - cudf::strings_column_view(input), stream) + cudf::strings_column_view(input), stream, rmm::mr::get_current_device_resource()) .release())); } diff --git a/python/strings_udf/strings_udf/__init__.py b/python/strings_udf/strings_udf/__init__.py index 24f1a2d3bda..2222fb72009 100644 --- a/python/strings_udf/strings_udf/__init__.py +++ b/python/strings_udf/strings_udf/__init__.py @@ -43,7 +43,8 @@ def maybe_patch_numba_linker(driver_version): def _get_ptx_file(): if "RAPIDS_NO_INITIALIZE" in os.environ: - cc = int(os.environ.get("STRINGS_UDF_CC", "52")) + # shim_60.ptx is always built + cc = int(os.environ.get("STRINGS_UDF_CC", "60")) else: dev = cuda.get_current_device() diff --git a/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd b/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd index fb8e3a949bf..7b90760abcc 100644 --- a/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd +++ b/python/strings_udf/strings_udf/_lib/cpp/strings_udf.pxd @@ -11,9 +11,19 @@ from cudf._lib.cpp.types cimport size_type from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \ + "cudf::strings::udf" nogil: + cdef cppclass udf_string + cdef extern from "cudf/strings/udf/udf_apis.hpp" 
namespace \ "cudf::strings::udf" nogil: cdef unique_ptr[device_buffer] to_string_view_array(column_view) except + + cdef unique_ptr[column] column_from_udf_string_array( + udf_string* strings, size_type size, + ) except + + cdef void free_udf_string_array( + udf_string* strings, size_type size + ) except + cdef extern from "cudf/strings/detail/char_tables.hpp" namespace \ "cudf::strings::detail" nogil: diff --git a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx index db6e206843c..4fc9e473fa3 100644 --- a/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx +++ b/python/strings_udf/strings_udf/_lib/cudf_jit_udf.pyx @@ -6,15 +6,18 @@ from libcpp.utility cimport move from cudf.core.buffer import as_buffer from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column_view +from cudf._lib.cpp.column.column cimport column, column_view from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from strings_udf._lib.cpp.strings_udf cimport ( + column_from_udf_string_array as cpp_column_from_udf_string_array, + free_udf_string_array as cpp_free_udf_string_array, to_string_view_array as cpp_to_string_view_array, + udf_string, ) -def to_string_view_array(Column strings_col): +def column_to_string_view_array(Column strings_col): cdef unique_ptr[device_buffer] c_buffer cdef column_view input_view = strings_col.view() with nogil: @@ -22,3 +25,17 @@ def to_string_view_array(Column strings_col): device_buffer = DeviceBuffer.c_from_unique_ptr(move(c_buffer)) return as_buffer(device_buffer) + + +def column_from_udf_string_array(DeviceBuffer d_buffer): + cdef size_t size = int(d_buffer.c_size() / sizeof(udf_string)) + cdef udf_string* data = d_buffer.c_data() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_column_from_udf_string_array(data, size)) + cpp_free_udf_string_array(data, size) + + result = Column.from_unique_ptr(move(c_result)) + + return result diff --git 
a/python/strings_udf/strings_udf/_typing.py b/python/strings_udf/strings_udf/_typing.py index 675507bccde..a309a9cb93c 100644 --- a/python/strings_udf/strings_udf/_typing.py +++ b/python/strings_udf/strings_udf/_typing.py @@ -3,6 +3,7 @@ import operator import llvmlite.binding as ll +import numpy as np from numba import types from numba.core.datamodel import default_manager from numba.core.extending import models, register_model @@ -23,19 +24,33 @@ # String object definitions -class DString(types.Type): +class UDFString(types.Type): + + np_dtype = np.dtype("object") + def __init__(self): - super().__init__(name="dstring") + super().__init__(name="udf_string") llty = default_manager[self].get_value_type() self.size_bytes = llty.get_abi_size(target_data) + @property + def return_type(self): + return self + class StringView(types.Type): + + np_dtype = np.dtype("object") + def __init__(self): super().__init__(name="string_view") llty = default_manager[self].get_value_type() self.size_bytes = llty.get_abi_size(target_data) + @property + def return_type(self): + return UDFString() + @register_model(StringView) class stringview_model(models.StructModel): @@ -56,9 +71,9 @@ def __init__(self, dmm, fe_type): super().__init__(dmm, fe_type, self._members) -@register_model(DString) -class dstring_model(models.StructModel): - # from dstring.hpp: +@register_model(UDFString) +class udf_string_model(models.StructModel): + # from udf_string.hpp: # private: # char* m_data{}; # cudf::size_type m_bytes{}; @@ -74,8 +89,9 @@ def __init__(self, dmm, fe_type): super().__init__(dmm, fe_type, self._members) -any_string_ty = (StringView, DString, types.StringLiteral) +any_string_ty = (StringView, UDFString, types.StringLiteral) string_view = StringView() +udf_string = UDFString() class StrViewArgHandler: @@ -93,7 +109,9 @@ class StrViewArgHandler: """ def prepare_args(self, ty, val, **kwargs): - if isinstance(ty, types.CPointer) and isinstance(ty.dtype, StringView): + if isinstance(ty, 
types.CPointer) and isinstance( + ty.dtype, (StringView, UDFString) + ): return types.uint64, val.ptr else: return ty, val @@ -113,7 +131,7 @@ def generic(self, args, kws): if isinstance(args[0], any_string_ty) and len(args) == 1: # length: # string_view -> int32 - # dstring -> int32 + # udf_string -> int32 # literal -> int32 return nb_signature(size_type, args[0]) @@ -163,7 +181,7 @@ def attr(self, mod): return attr -def create_identifier_attr(attrname): +def create_identifier_attr(attrname, retty): """ Helper function wrapping numba's low level extension API. Provides the boilerplate needed to register a unary function of a string @@ -174,7 +192,7 @@ class StringViewIdentifierAttr(AbstractTemplate): key = f"StringView.{attrname}" def generic(self, args, kws): - return nb_signature(types.boolean, recvr=self.this) + return nb_signature(retty, recvr=self.this) def attr(self, mod): return types.BoundFunction(StringViewIdentifierAttr, string_view) @@ -211,6 +229,7 @@ def resolve_count(self, mod): "isnumeric", "istitle", ] +string_return_attrs = ["strip", "lstrip", "rstrip"] for func in bool_binary_funcs: setattr( @@ -219,12 +238,24 @@ def resolve_count(self, mod): create_binary_attr(func, types.boolean), ) +for func in string_return_attrs: + setattr( + StringViewAttrs, + f"resolve_{func}", + create_binary_attr(func, udf_string), + ) + + for func in int_binary_funcs: setattr( StringViewAttrs, f"resolve_{func}", create_binary_attr(func, size_type) ) for func in id_unary_funcs: - setattr(StringViewAttrs, f"resolve_{func}", create_identifier_attr(func)) + setattr( + StringViewAttrs, + f"resolve_{func}", + create_identifier_attr(func, types.boolean), + ) cuda_decl_registry.register_attr(StringViewAttrs) diff --git a/python/strings_udf/strings_udf/lowering.py b/python/strings_udf/strings_udf/lowering.py index df0902dfa98..17a1869e881 100644 --- a/python/strings_udf/strings_udf/lowering.py +++ b/python/strings_udf/strings_udf/lowering.py @@ -5,6 +5,7 @@ from numba import 
cuda, types from numba.core import cgutils +from numba.core.datamodel import default_manager from numba.core.typing import signature as nb_signature from numba.cuda.cudadrv import nvvm from numba.cuda.cudaimpl import ( @@ -13,21 +14,16 @@ ) from strings_udf._lib.tables import get_character_flags_table_ptr -from strings_udf._typing import size_type, string_view +from strings_udf._typing import size_type, string_view, udf_string character_flags_table_ptr = get_character_flags_table_ptr() - -# read-only functions -# We will provide only one overload for this set of functions, which will -# expect a string_view. When a literal is encountered, numba will promote it to -# a string_view whereas when a dstring is encountered, numba will convert it to -# a view via its native view() method. - _STR_VIEW_PTR = types.CPointer(string_view) +_UDF_STRING_PTR = types.CPointer(udf_string) # CUDA function declarations +# read-only (input is a string_view, output is a fixed width type) _string_view_len = cuda.declare_device("len", size_type(_STR_VIEW_PTR)) @@ -39,6 +35,12 @@ def _declare_binary_func(lhs, rhs, out, name): ) +def _declare_strip_func(name): + return cuda.declare_device( + name, size_type(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR) + ) + + # A binary function of the form f(string, string) -> bool _declare_bool_str_str_func = partial( _declare_binary_func, _STR_VIEW_PTR, _STR_VIEW_PTR, types.boolean @@ -60,6 +62,9 @@ def _declare_binary_func(lhs, rhs, out, name): _string_view_find = _declare_size_type_str_str_func("find") _string_view_rfind = _declare_size_type_str_str_func("rfind") _string_view_contains = _declare_bool_str_str_func("contains") +_string_view_strip = _declare_strip_func("strip") +_string_view_lstrip = _declare_strip_func("lstrip") +_string_view_rstrip = _declare_strip_func("rstrip") # A binary function of the form f(string, int) -> bool @@ -107,6 +112,35 @@ def cast_string_literal_to_string_view(context, builder, fromty, toty, val): return sv._getvalue()
+@cuda_lowering_registry.lower_cast(string_view, udf_string) +def cast_string_view_to_udf_string(context, builder, fromty, toty, val): + sv_ptr = builder.alloca(default_manager[fromty].get_value_type()) + udf_str_ptr = builder.alloca(default_manager[toty].get_value_type()) + builder.store(val, sv_ptr) + _ = context.compile_internal( + builder, + call_create_udf_string_from_string_view, + nb_signature(types.void, _STR_VIEW_PTR, types.CPointer(udf_string)), + (sv_ptr, udf_str_ptr), + ) + result = cgutils.create_struct_proxy(udf_string)( + context, builder, value=builder.load(udf_str_ptr) + ) + + return result._getvalue() + + +# utilities +_create_udf_string_from_string_view = cuda.declare_device( + "udf_string_from_string_view", + types.void(types.CPointer(string_view), types.CPointer(udf_string)), +) + + +def call_create_udf_string_from_string_view(sv, udf_str): + _create_udf_string_from_string_view(sv, udf_str) + + # String function implementations def call_len_string_view(st): return _string_view_len(st) @@ -138,17 +172,44 @@ def deco(cuda_func): def binary_func_impl(context, builder, sig, args): lhs_ptr = builder.alloca(args[0].type) rhs_ptr = builder.alloca(args[1].type) - builder.store(args[0], lhs_ptr) builder.store(args[1], rhs_ptr) - result = context.compile_internal( - builder, - cuda_func, - nb_signature(retty, _STR_VIEW_PTR, _STR_VIEW_PTR), - (lhs_ptr, rhs_ptr), - ) - return result + # these conditional statements should compile out + if retty != udf_string: + # binary function of two strings yielding a fixed-width type + # example: str.startswith(other) -> bool + # shim functions can return the value through nb_retval + result = context.compile_internal( + builder, + cuda_func, + nb_signature(retty, _STR_VIEW_PTR, _STR_VIEW_PTR), + (lhs_ptr, rhs_ptr), + ) + return result + else: + # binary function of two strings yielding a new string + # example: str.strip(other) -> str + # shim functions can not return a struct due to C linkage + # so we create a new 
udf_string and pass a pointer to it + # for the shim function to write the output to. The return + # value of compile_internal is therefore discarded (although + # this may change in the future if we need to return error + # codes, for instance). + udf_str_ptr = builder.alloca( + default_manager[udf_string].get_value_type() + ) + + _ = context.compile_internal( + builder, + cuda_func, + size_type(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), + (udf_str_ptr, lhs_ptr, rhs_ptr), + ) + result = cgutils.create_struct_proxy(udf_string)( + context, builder, value=builder.load(udf_str_ptr) + ) + return result._getvalue() return binary_func_impl @@ -190,6 +251,21 @@ def lt_impl(st, rhs): return _string_view_lt(st, rhs) +@create_binary_string_func("StringView.strip", udf_string) +def strip_impl(result, to_strip, strip_char): + return _string_view_strip(result, to_strip, strip_char) + + +@create_binary_string_func("StringView.lstrip", udf_string) +def lstrip_impl(result, to_strip, strip_char): + return _string_view_lstrip(result, to_strip, strip_char) + + +@create_binary_string_func("StringView.rstrip", udf_string) +def rstrip_impl(result, to_strip, strip_char): + return _string_view_rstrip(result, to_strip, strip_char) + + @create_binary_string_func("StringView.startswith", types.boolean) def startswith_impl(sv, substr): return _string_view_startswith(sv, substr) diff --git a/python/strings_udf/strings_udf/tests/test_string_udfs.py b/python/strings_udf/strings_udf/tests/test_string_udfs.py index 1a5dfa00dd7..522433d404f 100644 --- a/python/strings_udf/strings_udf/tests/test_string_udfs.py +++ b/python/strings_udf/strings_udf/tests/test_string_udfs.py @@ -9,14 +9,18 @@ from numba.types import CPointer, void import cudf +import rmm from cudf.testing._utils import assert_eq import strings_udf -from strings_udf._lib.cudf_jit_udf import to_string_view_array -from strings_udf._typing import str_view_arg_handler, string_view +from strings_udf._lib.cudf_jit_udf import ( + 
column_from_udf_string_array, + column_to_string_view_array, +) +from strings_udf._typing import str_view_arg_handler, string_view, udf_string -def get_kernel(func, dtype): +def get_kernel(func, dtype, size): """ Create a kernel for testing a single scalar string function Allocates an output vector with a dtype specified by the caller @@ -25,15 +29,19 @@ def get_kernel(func, dtype): """ func = cuda.jit(device=True)(func) - outty = numba.np.numpy_support.from_dtype(dtype) - sig = nb_signature(void, CPointer(string_view), outty[::1]) + + if dtype == "str": + outty = CPointer(udf_string) + else: + outty = numba.np.numpy_support.from_dtype(dtype)[::1] + sig = nb_signature(void, CPointer(string_view), outty) @cuda.jit( sig, link=[strings_udf.ptxpath], extensions=[str_view_arg_handler] ) def kernel(input_strings, output_col): id = cuda.grid(1) - if id < len(output_col): + if id < size: st = input_strings[id] result = func(st) output_col[id] = result @@ -50,15 +58,22 @@ def run_udf_test(data, func, dtype): and then assembles the result back into a cuDF series before comparing it with the equivalent pandas result """ - dtype = np.dtype(dtype) + if dtype == "str": + output = rmm.DeviceBuffer(size=len(data) * udf_string.size_bytes) + else: + dtype = np.dtype(dtype) + output = cudf.core.column.column_empty(len(data), dtype=dtype) + cudf_column = cudf.core.column.as_column(data) - str_view_ary = to_string_view_array(cudf_column) + str_views = column_to_string_view_array(cudf_column) + + kernel = get_kernel(func, dtype, len(data)) + kernel.forall(len(data))(str_views, output) - output_ary = cudf.core.column.column_empty(len(data), dtype=dtype) + if dtype == "str": + output = column_from_udf_string_array(output) - kernel = get_kernel(func, dtype) - kernel.forall(len(data))(str_view_ary, output_ary) - got = cudf.Series(output_ary, dtype=dtype) + got = cudf.Series(output, dtype=dtype) expect = pd.Series(data).apply(func) assert_eq(expect, got, check_dtype=False) @@ -256,3 +271,34 
@@ def func(st): return st.startswith(substr) run_udf_test(data, func, "bool") + + +def test_string_udf_return_string(data): + def func(st): + return st + + run_udf_test(data, func, "str") + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_strip(data, strip_char): + def func(st): + return st.strip(strip_char) + + run_udf_test(data, func, "str") + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_lstrip(data, strip_char): + def func(st): + return st.lstrip(strip_char) + + run_udf_test(data, func, "str") + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_rstrip(data, strip_char): + def func(st): + return st.rstrip(strip_char) + + run_udf_test(data, func, "str")